diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index ddfaa170..2ef3bca4 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -15,9 +15,9 @@ jobs: - name: Setup Go uses: actions/setup-go@v5 with: - go-version: '~1.24.0' + go-version-file: 'go.mod' - name: Run linter uses: golangci/golangci-lint-action@v8 with: - version: v2.1.5 + version: v2.12.2 diff --git a/.github/workflows/publish.yaml b/.github/workflows/publish.yaml index 8949c76b..5dcc90bb 100644 --- a/.github/workflows/publish.yaml +++ b/.github/workflows/publish.yaml @@ -18,6 +18,7 @@ jobs: secrets: inherit publish-kustomize-bundles: + needs: publish-container-image permissions: id-token: write contents: read @@ -26,4 +27,6 @@ jobs: with: bundle-name: ghcr.io/datum-cloud/compute-kustomize bundle-path: config + image-name: ghcr.io/datum-cloud/compute + image-overlays: config/base/manager secrets: inherit diff --git a/.github/workflows/test-e2e.yml b/.github/workflows/test-e2e.yml index 8429bf2d..b3b66dc5 100644 --- a/.github/workflows/test-e2e.yml +++ b/.github/workflows/test-e2e.yml @@ -15,7 +15,7 @@ jobs: - name: Setup Go uses: actions/setup-go@v5 with: - go-version: '~1.24.0' + go-version-file: 'go.mod' - name: Install the latest version of kind run: | diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 834d33a0..07fbf7c8 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -15,7 +15,7 @@ jobs: - name: Setup Go uses: actions/setup-go@v5 with: - go-version: '~1.24.0' + go-version-file: 'go.mod' - name: Running Tests run: | diff --git a/.gitignore b/.gitignore index 2b0c6e44..08f06784 100644 --- a/.gitignore +++ b/.gitignore @@ -14,8 +14,8 @@ # Output of the go coverage tool, specifically when used with LiteIDE *.out -# Dependency directories (remove the comment below to include it) -# vendor/ +# Dependency directories +vendor/ # Go workspace file go.work @@ -25,3 +25,12 @@ go.work.sum .env bin/ + +# Local e2e 
environment artefacts (Kind kubeconfigs, etc.) +tmp/ + +# GoReleaser build output +dist/ + +# Local datumctl plugin build binary +/datumctl-compute diff --git a/.golangci.yml b/.golangci.yml index a7246fbb..736f902f 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -35,6 +35,19 @@ linters: - dupl - lll path: internal/* + # field.ErrorList{} is the idiomatic Kubernetes validation init pattern; + # preallocating requires knowing the error count in advance which is not + # possible in recursive validation helpers. + - linters: + - prealloc + path: internal/validation/ + # Test helpers that build slices via append are clearer without prealloc. + - linters: + - prealloc + path: internal/controller/instancecontrol/ + - linters: + - errcheck + path: internal/cmd/.* paths: - third_party$ - builtin$ diff --git a/.goreleaser-plugin.yaml b/.goreleaser-plugin.yaml new file mode 100644 index 00000000..69fb233c --- /dev/null +++ b/.goreleaser-plugin.yaml @@ -0,0 +1,51 @@ +# yaml-language-server: $schema=https://goreleaser.com/static/schema.json +version: 2 + +project_name: datumctl-compute + +before: + hooks: + - go mod tidy + +builds: + - id: datumctl-compute + binary: datumctl-compute + main: ./cmd/datumctl-compute + env: + - CGO_ENABLED=0 + goos: + - linux + - darwin + - windows + goarch: + - amd64 + - arm64 + ldflags: + - "-X main.version=v{{.Version}}" + +archives: + - id: datumctl-compute + builds: + - datumctl-compute + format: tar.gz + name_template: >- + {{ .ProjectName }}_ + {{- title .Os }}_ + {{- if eq .Arch "amd64" }}x86_64 + {{- else if eq .Arch "386" }}i386 + {{- else }}{{ .Arch }}{{ end }} + {{- if .Arm }}v{{ .Arm }}{{ end }} + format_overrides: + - goos: windows + format: zip + +checksum: + name_template: "checksums.txt" + +changelog: + sort: asc + filters: + exclude: + - "^docs:" + - "^test:" + - "^chore:" diff --git a/Makefile b/Makefile index 61744a36..3d6a3e2e 100644 --- a/Makefile +++ b/Makefile @@ -177,7 +177,7 @@ KUSTOMIZE_VERSION ?= v5.5.0 
CONTROLLER_TOOLS_VERSION ?= v0.16.4 DEFAULTER_GEN_VERSION ?= v0.32.3 ENVTEST_VERSION ?= release-0.19 -GOLANGCI_LINT_VERSION ?= v2.1.5 +GOLANGCI_LINT_VERSION ?= v2.12.2 # renovate: datasource=go depName=fybrik.io/crdoc CRDOC_VERSION ?= v0.6.4 diff --git a/Taskfile.yaml b/Taskfile.yaml new file mode 100644 index 00000000..bcfbb0f8 --- /dev/null +++ b/Taskfile.yaml @@ -0,0 +1,481 @@ +version: '3' + +# ─── Variables ────────────────────────────────────────────────────────────── + +vars: + # Karmada Helm chart version to install (karmada-charts/karmada) + KARMADA_VERSION: v1.16.0 + + # karmadactl CLI version for cluster registration + KARMADACTL_VERSION: v1.16.0 + + # Chainsaw version for e2e testing (kyverno/chainsaw) + CHAINSAW_VERSION: v0.2.15 + + # Local tool directory (mirrors Makefile convention) + LOCALBIN: '{{.ROOT_DIR}}/bin' + KARMADACTL: '{{.ROOT_DIR}}/bin/karmadactl' + CHAINSAW: '{{.ROOT_DIR}}/bin/chainsaw' + + # Kind cluster names + KIND_CONTROL_PLANE: compute-control-plane + KIND_POP_DFW: compute-pop-dfw + KIND_POP_ORD: compute-pop-ord + + # All cluster names (for CRD installation loops) + KIND_ALL_CLUSTERS: '{{.KIND_CONTROL_PLANE}} {{.KIND_POP_DFW}} {{.KIND_POP_ORD}}' + + # Working directory for e2e artefacts (gitignored) + E2E_DIR: '{{.ROOT_DIR}}/tmp/e2e' + KUBECONFIG_DIR: '{{.ROOT_DIR}}/tmp/e2e/kubeconfigs' + + # Fixed NodePort for the Karmada API server. + # The Kind management cluster is created with an extraPortMapping for this port + # so it is reachable at https://localhost:32443 from the developer's machine. 
+ KARMADA_API_NODEPORT: "32443" + +# ─── Tasks ────────────────────────────────────────────────────────────────── + +tasks: + + default: + cmds: + - task --list + silent: true + + # ════════════════════════════════════════════════════════════════════════ + # e2e environment lifecycle + # ════════════════════════════════════════════════════════════════════════ + + e2e:up: + desc: "Create the full local Kind+Karmada e2e environment (idempotent)" + cmds: + - task: e2e:tools + - task: e2e:clusters:create + - task: e2e:karmada:install + - task: e2e:karmada:configure + - task: e2e:karmada:join-clusters + - task: e2e:crds:install + - cmd: | + echo "" + echo "╔══════════════════════════════════════════════════════════╗" + echo "║ e2e environment ready ║" + echo "╠══════════════════════════════════════════════════════════╣" + echo "║ Control plane: {{.KUBECONFIG_DIR}}/control-plane.yaml" + echo "║ Karmada API: {{.KUBECONFIG_DIR}}/karmada.yaml" + echo "║ POP DFW: {{.KUBECONFIG_DIR}}/pop-dfw.yaml" + echo "║ POP ORD: {{.KUBECONFIG_DIR}}/pop-ord.yaml" + echo "╠══════════════════════════════════════════════════════════╣" + echo "║ Export for kubectl: ║" + echo "║ export KUBECONFIG={{.KUBECONFIG_DIR}}/control-plane.yaml" + echo "╚══════════════════════════════════════════════════════════╝" + silent: false + + e2e:down: + desc: "Tear down the local e2e environment" + cmds: + - kind delete cluster --name {{.KIND_CONTROL_PLANE}} 2>/dev/null || true + - kind delete cluster --name {{.KIND_POP_DFW}} 2>/dev/null || true + - kind delete cluster --name {{.KIND_POP_ORD}} 2>/dev/null || true + - rm -rf {{.E2E_DIR}} + - cmd: echo "✓ e2e environment torn down" + silent: false + + e2e:test: + desc: "Run Chainsaw e2e tests against the local Kind+Karmada environment" + deps: [e2e:tools:chainsaw] + cmds: + - | + KUBECONFIG={{.KUBECONFIG_DIR}}/control-plane.yaml \ + {{.CHAINSAW}} test \ + --config test/e2e/chainsaw-config.yaml \ + test/e2e/ \ + {{.CLI_ARGS}} + + e2e:test:filter: + desc: "Run a 
subset of e2e tests by name regex (e.g. task e2e:test:filter -- --include-test-regex federation)" + deps: [e2e:tools:chainsaw] + cmds: + - | + KUBECONFIG={{.KUBECONFIG_DIR}}/control-plane.yaml \ + {{.CHAINSAW}} test \ + --config test/e2e/chainsaw-config.yaml \ + {{.CLI_ARGS}} \ + test/e2e/ + + # ════════════════════════════════════════════════════════════════════════ + # Tool installation + # ════════════════════════════════════════════════════════════════════════ + + e2e:tools: + desc: "Install e2e-specific tooling (karmadactl, chainsaw, helm repo)" + cmds: + - task: e2e:tools:karmadactl + - task: e2e:tools:chainsaw + - task: e2e:tools:helm-repo + + e2e:tools:karmadactl: + desc: "Download karmadactl {{.KARMADACTL_VERSION}}" + cmds: + - mkdir -p {{.LOCALBIN}} + - | + if [ ! -f "{{.KARMADACTL}}" ]; then + OS=$(uname -s | tr '[:upper:]' '[:lower:]') + ARCH=$(uname -m | sed 's/x86_64/amd64/;s/aarch64/arm64/') + URL="https://github.com/karmada-io/karmada/releases/download/{{.KARMADACTL_VERSION}}/karmadactl-${OS}-${ARCH}.tgz" + echo "Downloading karmadactl {{.KARMADACTL_VERSION}} (${OS}/${ARCH}) from ${URL}..." + curl -sSfL "${URL}" | tar -xz -C {{.LOCALBIN}} karmadactl + chmod +x {{.KARMADACTL}} + echo "karmadactl installed → {{.KARMADACTL}}" + else + echo "karmadactl already present at {{.KARMADACTL}}" + fi + status: + - test -f {{.KARMADACTL}} + + e2e:tools:chainsaw: + desc: "Download chainsaw {{.CHAINSAW_VERSION}}" + cmds: + - mkdir -p {{.LOCALBIN}} + - | + if [ ! -f "{{.CHAINSAW}}" ]; then + OS=$(uname -s | tr '[:upper:]' '[:lower:]') + ARCH=$(uname -m | sed 's/x86_64/amd64/;s/aarch64/arm64/') + URL="https://github.com/kyverno/chainsaw/releases/download/{{.CHAINSAW_VERSION}}/chainsaw_${OS}_${ARCH}.tar.gz" + echo "Downloading chainsaw {{.CHAINSAW_VERSION}} (${OS}/${ARCH}) from ${URL}..." 
+ curl -sSfL "${URL}" | tar -xz -C {{.LOCALBIN}} chainsaw + chmod +x {{.CHAINSAW}} + echo "chainsaw installed → {{.CHAINSAW}}" + else + echo "chainsaw already present at {{.CHAINSAW}}" + fi + status: + - test -f {{.CHAINSAW}} + + e2e:tools:helm-repo: + desc: "Add/update karmada-charts Helm repository" + cmds: + - | + if ! helm repo list 2>/dev/null | grep -q karmada-charts; then + helm repo add karmada-charts https://raw.githubusercontent.com/karmada-io/karmada/master/charts + echo "Added karmada-charts Helm repository" + fi + helm repo update karmada-charts + status: + - helm repo list 2>/dev/null | grep -q karmada-charts + + # ════════════════════════════════════════════════════════════════════════ + # Kind cluster management + # ════════════════════════════════════════════════════════════════════════ + + e2e:clusters:create: + desc: "Create all Kind clusters (idempotent)" + cmds: + # Management / control-plane cell cluster — needs extraPortMappings for + # the Karmada API server NodePort so it is accessible at localhost:32443. + - task: _e2e:cluster:create + vars: + CLUSTER_NAME: "{{.KIND_CONTROL_PLANE}}" + KIND_CONFIG: hack/e2e/kind-control-plane.yaml + # POP cell clusters — default Kind config is sufficient. + - task: _e2e:cluster:create + vars: + CLUSTER_NAME: "{{.KIND_POP_DFW}}" + KIND_CONFIG: "" + - task: _e2e:cluster:create + vars: + CLUSTER_NAME: "{{.KIND_POP_ORD}}" + KIND_CONFIG: "" + - mkdir -p {{.KUBECONFIG_DIR}} + - task: _e2e:kubeconfigs:export + + _e2e:cluster:create: + internal: true + cmds: + - | + if kind get clusters 2>/dev/null | grep -qx '{{.CLUSTER_NAME}}'; then + echo "Kind cluster '{{.CLUSTER_NAME}}' already exists — skipping" + else + echo "Creating Kind cluster '{{.CLUSTER_NAME}}'..." 
+ CONFIG_FLAG="" + if [ -n "{{.KIND_CONFIG}}" ]; then + CONFIG_FLAG="--config {{.KIND_CONFIG}}" + fi + kind create cluster \ + --name {{.CLUSTER_NAME}} \ + $CONFIG_FLAG \ + --wait 90s + fi + + _e2e:kubeconfigs:export: + internal: true + desc: "Export Kind kubeconfigs and create Docker-IP variants for cross-cluster use" + cmds: + # Standard kubeconfigs (localhost-based, for developer kubectl use) + - kind export kubeconfig --name {{.KIND_CONTROL_PLANE}} --kubeconfig {{.KUBECONFIG_DIR}}/control-plane.yaml + - kind export kubeconfig --name {{.KIND_POP_DFW}} --kubeconfig {{.KUBECONFIG_DIR}}/pop-dfw.yaml + - kind export kubeconfig --name {{.KIND_POP_ORD}} --kubeconfig {{.KUBECONFIG_DIR}}/pop-ord.yaml + # Docker-IP kubeconfigs (used by Karmada controller, running inside Docker, + # to reach POP cell API servers across the kind bridge network) + - | + hack/e2e/make-internal-kubeconfig.sh \ + {{.KUBECONFIG_DIR}}/pop-dfw.yaml \ + {{.KUBECONFIG_DIR}}/pop-dfw-internal.yaml \ + {{.KIND_POP_DFW}} + - | + hack/e2e/make-internal-kubeconfig.sh \ + {{.KUBECONFIG_DIR}}/pop-ord.yaml \ + {{.KUBECONFIG_DIR}}/pop-ord-internal.yaml \ + {{.KIND_POP_ORD}} + + # ════════════════════════════════════════════════════════════════════════ + # Karmada installation + # ════════════════════════════════════════════════════════════════════════ + + e2e:karmada:install: + desc: "Install Karmada into the management cluster via Helm (idempotent)" + cmds: + - | + if kubectl --kubeconfig={{.KUBECONFIG_DIR}}/control-plane.yaml \ + get ns karmada-system &>/dev/null; then + echo "Karmada already installed (karmada-system namespace exists)" + else + echo "Installing Karmada {{.KARMADA_VERSION}} via Helm..." 
+ helm install karmada karmada-charts/karmada \ + --kubeconfig={{.KUBECONFIG_DIR}}/control-plane.yaml \ + --namespace karmada-system \ + --create-namespace \ + --version {{.KARMADA_VERSION}} \ + --set apiServer.serviceType=NodePort \ + --set apiServer.nodePort={{.KARMADA_API_NODEPORT}} \ + --wait \ + --timeout 5m + echo "Karmada installed" + fi + - task: _e2e:karmada:build-kubeconfig + + e2e:karmada:configure: + desc: "Apply federation component config to the Karmada API server (idempotent)" + cmds: + - | + echo "Applying federation component to Karmada..." + kubectl --kubeconfig={{.KUBECONFIG_DIR}}/karmada.yaml apply \ + -k config/components/federation/ + echo "Federation component applied" + + _e2e:karmada:build-kubeconfig: + internal: true + desc: "Extract Karmada kubeconfig from secret and patch server to localhost:{{.KARMADA_API_NODEPORT}}" + cmds: + - | + echo "Building Karmada kubeconfig → {{.KUBECONFIG_DIR}}/karmada.yaml" + # Extract raw kubeconfig from the secret the Helm chart creates + kubectl --kubeconfig={{.KUBECONFIG_DIR}}/control-plane.yaml \ + get secret karmada-kubeconfig \ + -n karmada-system \ + -o jsonpath='{.data.kubeconfig}' \ + | base64 -d > {{.KUBECONFIG_DIR}}/karmada-raw.yaml + # Rewrite the server address to the NodePort exposed on localhost + python3 - {{.KUBECONFIG_DIR}}/karmada-raw.yaml {{.KUBECONFIG_DIR}}/karmada.yaml 127.0.0.1 {{.KARMADA_API_NODEPORT}} << 'PYEOF' + import sys, yaml + + src, dst, host, port = sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4] + + with open(src) as f: + cfg = yaml.safe_load(f) + + for cluster in cfg.get('clusters', []): + old = cluster['cluster'].get('server', '') + cluster['cluster']['server'] = f'https://{host}:{port}' + # The cert is for the internal cluster IP, so skip TLS verification. + # This is a local dev-only environment. 
+ cluster['cluster']['insecure-skip-tls-verify'] = True + cluster['cluster'].pop('certificate-authority-data', None) + print(f" karmada server: {old} → https://{host}:{port}", file=sys.stderr) + + with open(dst, 'w') as f: + yaml.dump(cfg, f, default_flow_style=False) + PYEOF + rm {{.KUBECONFIG_DIR}}/karmada-raw.yaml + + # ════════════════════════════════════════════════════════════════════════ + # POP cell cluster registration + # ════════════════════════════════════════════════════════════════════════ + + e2e:karmada:join-clusters: + desc: "Register POP cell clusters with Karmada and apply city-code labels" + cmds: + - task: _e2e:karmada:join-cluster + vars: + CLUSTER_NAME: "{{.KIND_POP_DFW}}" + CITY_CODE: dfw + EXTERNAL_KUBECONFIG: "{{.KUBECONFIG_DIR}}/pop-dfw.yaml" + INTERNAL_KUBECONFIG: "{{.KUBECONFIG_DIR}}/pop-dfw-internal.yaml" + - task: _e2e:karmada:join-cluster + vars: + CLUSTER_NAME: "{{.KIND_POP_ORD}}" + CITY_CODE: ord + EXTERNAL_KUBECONFIG: "{{.KUBECONFIG_DIR}}/pop-ord.yaml" + INTERNAL_KUBECONFIG: "{{.KUBECONFIG_DIR}}/pop-ord-internal.yaml" + + _e2e:karmada:join-cluster: + internal: true + cmds: + # ── Register with karmadactl join ────────────────────────────────── + # We pass the EXTERNAL kubeconfig (localhost-based) here so karmadactl + # can reach the member cluster from this macOS host to set up initial + # RBAC. The stored secret is patched below to the Docker-IP variant. + - | + if kubectl --kubeconfig={{.KUBECONFIG_DIR}}/karmada.yaml \ + get cluster {{.CLUSTER_NAME}} &>/dev/null; then + echo "Cluster '{{.CLUSTER_NAME}}' already registered in Karmada — skipping join" + else + echo "Joining '{{.CLUSTER_NAME}}' to Karmada..." 
+ {{.KARMADACTL}} join {{.CLUSTER_NAME}} \ + --kubeconfig={{.KUBECONFIG_DIR}}/karmada.yaml \ + --cluster-kubeconfig={{.EXTERNAL_KUBECONFIG}} \ + --cluster-context=kind-{{.CLUSTER_NAME}} + echo "Cluster '{{.CLUSTER_NAME}}' registered" + fi + # ── Patch cluster secret → Docker-IP kubeconfig ─────────────────── + # The Karmada controller manager runs inside Docker; it cannot use + # localhost to reach POP cell API servers. We update the stored secret + # with a kubeconfig whose server address uses the Kind container IP so + # container-to-container communication works across the kind bridge. + - | + hack/e2e/patch-cluster-secret.sh \ + {{.KUBECONFIG_DIR}}/karmada.yaml \ + {{.CLUSTER_NAME}} \ + {{.INTERNAL_KUBECONFIG}} + # ── Apply city-code label ────────────────────────────────────────── + - | + kubectl --kubeconfig={{.KUBECONFIG_DIR}}/karmada.yaml \ + label cluster {{.CLUSTER_NAME}} \ + topology.datum.net/city-code={{.CITY_CODE}} \ + --overwrite + echo "Labeled cluster '{{.CLUSTER_NAME}}' with topology.datum.net/city-code={{.CITY_CODE}}" + + # ════════════════════════════════════════════════════════════════════════ + # CRD installation + # ════════════════════════════════════════════════════════════════════════ + + e2e:crds:install: + desc: "Install compute + NSO CRDs to all clusters" + cmds: + - task: _e2e:crds:compute + - task: _e2e:crds:nso + + _e2e:crds:compute: + internal: true + desc: "Apply compute CRDs to all clusters and the Karmada API server" + cmds: + # All three Kind clusters + the Karmada API server get the compute CRDs. + # The Karmada API server needs them so it can store and propagate + # WorkloadDeployment objects. + - | + for KC in \ + {{.KUBECONFIG_DIR}}/control-plane.yaml \ + {{.KUBECONFIG_DIR}}/karmada.yaml \ + {{.KUBECONFIG_DIR}}/pop-dfw.yaml \ + {{.KUBECONFIG_DIR}}/pop-ord.yaml; do + echo "Installing compute CRDs → $(basename $KC .yaml)..." 
+ kubectl --kubeconfig="$KC" apply -k config/base/crd --server-side + done + + _e2e:crds:nso: + internal: true + desc: "Apply NSO CRDs to control-plane and POP cell clusters" + cmds: + # NSO CRDs (NetworkBinding, SubnetClaim, etc.) are installed on the + # control-plane as well as POP cells. The control-plane operator needs them + # so that Subnet/SubnetClaim informer watches can start without cache errors, + # even though NSO controllers themselves only run on POP cells. + - | + go mod download go.datum.net/network-services-operator + NSO_VERSION=$(go list -m -json go.datum.net/network-services-operator \ + | python3 -c "import sys, json; print(json.load(sys.stdin)['Version'])") + NSO_CRD_PATH="$(go env GOMODCACHE)/go.datum.net/network-services-operator@${NSO_VERSION}/config/crd" + echo "NSO CRDs from: ${NSO_CRD_PATH}" + for KC in \ + {{.KUBECONFIG_DIR}}/control-plane.yaml \ + {{.KUBECONFIG_DIR}}/pop-dfw.yaml \ + {{.KUBECONFIG_DIR}}/pop-ord.yaml; do + echo "Installing NSO CRDs → $(basename $KC .yaml)..." + kubectl --kubeconfig="$KC" apply -k "${NSO_CRD_PATH}" --server-side + done + + # ════════════════════════════════════════════════════════════════════════ + # Operator lifecycle (background processes for federation e2e) + # ════════════════════════════════════════════════════════════════════════ + + e2e:operator:start: + desc: "Start management (control-plane) and cell (pop-dfw) operator instances in the background" + cmds: + - mkdir -p {{.E2E_DIR}}/logs {{.E2E_DIR}}/pids + - | + echo "Starting management operator (control-plane)..." + KUBECONFIG={{.KUBECONFIG_DIR}}/control-plane.yaml \ + go run ./cmd/main.go \ + --karmada-kubeconfig={{.KUBECONFIG_DIR}}/karmada.yaml \ + --enable-cell-controllers=false \ + --leader-elect=false \ + --health-probe-bind-address=:9091 \ + > {{.E2E_DIR}}/logs/operator-management.log 2>&1 & + echo $! > {{.E2E_DIR}}/pids/operator-management.pid + echo "Management operator PID: $!" 
+ - | + echo "Waiting for management operator health check on :9091..." + remaining="${OPERATOR_READY_TIMEOUT:-120}" # seconds (override via env); go run compiles before /healthz is served, and a countdown avoids the non-POSIX SECONDS variable + until curl -sf http://localhost:9091/healthz >/dev/null 2>&1; do + if [ "$remaining" -le 0 ]; then + echo "ERROR: management operator did not become healthy within ${OPERATOR_READY_TIMEOUT:-120}s" + cat {{.E2E_DIR}}/logs/operator-management.log || true + exit 1 + fi + sleep 1; remaining=$((remaining-1)) + done + echo "Management operator is healthy" + - | + echo "Starting cell operator (pop-dfw)..." + KUBECONFIG={{.KUBECONFIG_DIR}}/pop-dfw.yaml \ + go run ./cmd/main.go \ + --karmada-kubeconfig={{.KUBECONFIG_DIR}}/karmada.yaml \ + --enable-management-controllers=false \ + --leader-elect=false \ + --health-probe-bind-address=:9092 \ + > {{.E2E_DIR}}/logs/operator-cell-dfw.log 2>&1 & + echo $! > {{.E2E_DIR}}/pids/operator-cell-dfw.pid + echo "Cell operator PID: $!" + - | + echo "Waiting for cell operator health check on :9092..." + remaining="${OPERATOR_READY_TIMEOUT:-120}" # seconds (override via env); go run compiles before /healthz is served, and a countdown avoids the non-POSIX SECONDS variable + until curl -sf http://localhost:9092/healthz >/dev/null 2>&1; do + if [ "$remaining" -le 0 ]; then + echo "ERROR: cell operator did not become healthy within ${OPERATOR_READY_TIMEOUT:-120}s" + cat {{.E2E_DIR}}/logs/operator-cell-dfw.log || true + exit 1 + fi + sleep 1; remaining=$((remaining-1)) + done + echo "Cell operator is healthy" + + e2e:operator:stop: + desc: "Stop background operator instances" + cmds: + - | + for PIDFILE in \ + {{.E2E_DIR}}/pids/operator-management.pid \ + {{.E2E_DIR}}/pids/operator-cell-dfw.pid; do + if [ -f "$PIDFILE" ]; then + PID=$(cat "$PIDFILE") + if kill -0 "$PID" 2>/dev/null; then + echo "Stopping PID $PID ($(basename $PIDFILE .pid))..." 
+ pkill -TERM -P "$PID" 2>/dev/null; kill -TERM "$PID" || true # PID is the go run wrapper, which does not relay SIGTERM; signal its child (the operator binary) first so it is not orphaned + else + echo "Process $PID ($(basename $PIDFILE .pid)) is not running" + fi + rm -f "$PIDFILE" + else + echo "PID file not found: $PIDFILE" + fi + done diff --git a/api/v1alpha/annotations.go b/api/v1alpha/annotations.go index a945547e..b5598d18 100644 --- a/api/v1alpha/annotations.go +++ b/api/v1alpha/annotations.go @@ -4,4 +4,9 @@ const ( AnnotationNamespace = "compute.datumapis.com" SSHKeysAnnotation = AnnotationNamespace + "/ssh-keys" + + // RestartedAtAnnotation may be set on an InstanceTemplateSpec's annotations + // (an RFC3339 timestamp) to request a rolling restart. It is included in the + // template hash and triggers the controller's ordered instance roll. + RestartedAtAnnotation = AnnotationNamespace + "/restartedAt" ) diff --git a/api/v1alpha/instance_types.go b/api/v1alpha/instance_types.go index 57e7f560..f497f454 100644 --- a/api/v1alpha/instance_types.go +++ b/api/v1alpha/instance_types.go @@ -107,6 +107,26 @@ type SandboxContainer struct { // +kubebuilder:validation:Required Image string `json:"image"` + // Entrypoint array to run in the container image, overriding the image's + // ENTRYPOINT. Each element is a separate token, not a shell command — to run a + // shell command use: ["sh", "-c", "my command"]. + // + // If not provided, the container image's own ENTRYPOINT is used. + // + // +kubebuilder:validation:Optional + Command []string `json:"command,omitempty"` + + // Arguments to the entrypoint, overriding the image's CMD. Combined with + // Command: when Command is also set the resulting invocation is + // append(Command, Args...). When only Args is set it overrides CMD while + // preserving the image's ENTRYPOINT. + // + // If neither Command nor Args is set, the image's own ENTRYPOINT and CMD + // are used unchanged. + // + // +kubebuilder:validation:Optional + Args []string `json:"args,omitempty"` + // List of environment variables to set in the container. 
// // +kubebuilder:validation:Optional @@ -384,8 +404,10 @@ const ( // InstanceReady indicates that the instance is ready InstanceReady = "Ready" - // InstanceRunning indicates that the instance is running - InstanceRunning = "Running" + // InstanceAvailable indicates that the instance is available. It is True + // while the instance is serving (including when scaled to zero); it does + // not assert that a process is actively running at this instant. + InstanceAvailable = "Available" // InstanceProgrammed indicates that the instance has been programmed InstanceProgrammed = "Programmed" @@ -400,26 +422,58 @@ const ( InstanceQuotaGrantedReasonQuotaExceeded = "QuotaExceeded" InstanceQuotaGrantedReasonValidationFailed = "ValidationFailed" InstanceProgrammedReasonPendingQuota = "PendingQuota" + + // InstanceQuotaGrantedReasonQuotaDisabled indicates quota enforcement is + // intentionally disabled: no credential path was configured. + InstanceQuotaGrantedReasonQuotaDisabled = "QuotaDisabled" + + // InstanceQuotaGrantedReasonBackendUnavailable indicates quota enforcement + // is configured but the Milo quota backend is unreachable (network error, + // TLS failure, 401/503). + InstanceQuotaGrantedReasonBackendUnavailable = "QuotaBackendUnavailable" + + // InstanceQuotaGrantedReasonProjectNotFound indicates the Milo project + // referenced by this instance does not exist (404 on the project control plane). + InstanceQuotaGrantedReasonProjectNotFound = "QuotaProjectNotFound" + + // InstanceQuotaGrantedReasonNamespaceNotFound indicates the claim namespace + // does not exist on the Milo project control plane (FM-5). + InstanceQuotaGrantedReasonNamespaceNotFound = "QuotaNamespaceNotFound" + + // InstanceQuotaGrantedReasonMisconfigured indicates the ResourceClaim was + // rejected by the Milo admission plugin (403/422): ResourceRegistration absent + // or claimingRules mismatch. 
+ InstanceQuotaGrantedReasonMisconfigured = "QuotaMisconfigured" + + // InstanceQuotaGrantedReasonProjectIDUnresolvable indicates the namespace + // label required to derive the Milo project ID is missing or unreadable. + InstanceQuotaGrantedReasonProjectIDUnresolvable = "QuotaProjectIDUnresolvable" + + // InstanceQuotaGrantedReasonNoBudget indicates the ResourceClaim exists and + // is pending because no AllowanceBucket has been configured for the project. + // This is distinct from PendingEvaluation (claim not yet created or first eval + // in progress) and from QuotaExceeded (explicitly denied). + InstanceQuotaGrantedReasonNoBudget = "QuotaNoBudget" ) const ( // InstanceReadyReasonSchedulingGatesPresent indicates that the instance is not ready because scheduling gates are present. InstanceReadyReasonSchedulingGatesPresent = "SchedulingGatesPresent" - // InstanceReadyReasonRunning indicates that the instance is running - InstanceReadyReasonRunning = "Running" + // InstanceReadyReasonAvailable indicates that the instance is available + InstanceReadyReasonAvailable = "Available" - // InstanceRunningReasonStopped indicates that the instance is stopped - InstanceRunningReasonStopped = "Stopped" + // InstanceAvailableReasonStopped indicates that the instance is stopped + InstanceAvailableReasonStopped = "Stopped" - // InstanceRunningReasonStarting indicates that the instance is starting - InstanceRunningReasonStarting = "Starting" + // InstanceAvailableReasonStarting indicates that the instance is starting + InstanceAvailableReasonStarting = "Starting" - // InstanceRunningReasonStopping indicates that the instance is stopping - InstanceRunningReasonStopping = "Stopping" + // InstanceAvailableReasonStopping indicates that the instance is stopping + InstanceAvailableReasonStopping = "Stopping" - // InstanceRunningReasonRunning indicates that the instance is running - InstanceRunningReasonRunning = "Running" + // InstanceAvailableReasonAvailable indicates that the 
instance is available + InstanceAvailableReasonAvailable = "Available" // InstanceProgrammedReasonPendingProgramming indicates that the instance has not been programmed InstanceProgrammedReasonPendingProgramming = "PendingProgramming" @@ -429,6 +483,21 @@ const ( // InstanceProgrammedReasonProgrammed indicates that the instance has been programmed InstanceProgrammedReasonProgrammed = "Programmed" + + // InstanceProgrammedReasonImageUnavailable indicates the instance image could + // not be pulled. Set by the infrastructure provider. + // User action required: fix the image reference in the workload spec. + InstanceProgrammedReasonImageUnavailable = "ImageUnavailable" + + // InstanceProgrammedReasonInstanceCrashing indicates the instance keeps + // crashing on startup. Set by the infrastructure provider. + // User action required: fix the workload (check logs for crash details). + InstanceProgrammedReasonInstanceCrashing = "InstanceCrashing" + + // InstanceProgrammedReasonConfigurationError indicates the instance failed to + // start due to a bad configuration. Set by the infrastructure provider. + // User action required: fix the workload configuration. 
+ InstanceProgrammedReasonConfigurationError = "ConfigurationError" ) type InstanceTemplateSpec struct { @@ -453,6 +522,7 @@ type InstanceTemplateSpec struct { // +kubebuilder:printcolumn:name="Network IP",type=string,JSONPath=`.status.networkInterfaces[0].assignments.networkIP` // +kubebuilder:printcolumn:name="External IP",type=string,JSONPath=`.status.networkInterfaces[0].assignments.externalIP` // +kubebuilder:printcolumn:name="Message",type=string,JSONPath=`.status.conditions[?(@.type=="Ready")].message`,priority=1 +// +kubebuilder:printcolumn:name="Quota",type=string,JSONPath=`.status.conditions[?(@.type=="QuotaGranted")].reason`,priority=1 type Instance struct { metav1.TypeMeta `json:",inline"` metav1.ObjectMeta `json:"metadata,omitempty"` @@ -462,7 +532,7 @@ type Instance struct { // Status defines the current state of an Instance. // - // +kubebuilder:default={conditions:{{type:"Programmed",status:"Unknown",reason:"Pending", message:"Waiting for controller", lastTransitionTime: "1970-01-01T00:00:00Z"},{type:"Running",status:"Unknown",reason:"Pending", message:"Waiting for controller", lastTransitionTime: "1970-01-01T00:00:00Z"},{type:"Ready",status:"Unknown",reason:"Pending", message:"Waiting for controller", lastTransitionTime: "1970-01-01T00:00:00Z"},{type:"QuotaGranted",status:"Unknown",reason:"PendingEvaluation",message:"Waiting for quota evaluation",lastTransitionTime:"1970-01-01T00:00:00Z"}}} + // +kubebuilder:default={conditions:{{type:"Programmed",status:"Unknown",reason:"Pending", message:"Waiting for controller", lastTransitionTime: "1970-01-01T00:00:00Z"},{type:"Available",status:"Unknown",reason:"Pending", message:"Waiting for controller", lastTransitionTime: "1970-01-01T00:00:00Z"},{type:"Ready",status:"Unknown",reason:"Pending", message:"Waiting for controller", lastTransitionTime: "1970-01-01T00:00:00Z"},{type:"QuotaGranted",status:"Unknown",reason:"PendingEvaluation",message:"Waiting for quota 
evaluation",lastTransitionTime:"1970-01-01T00:00:00Z"}}} Status InstanceStatus `json:"status,omitempty"` } diff --git a/api/v1alpha/labels.go b/api/v1alpha/labels.go index e1dac308..3b0d8b33 100644 --- a/api/v1alpha/labels.go +++ b/api/v1alpha/labels.go @@ -5,6 +5,25 @@ const ( WorkloadUIDLabel = LabelNamespace + "/workload-uid" WorkloadDeploymentUIDLabel = LabelNamespace + "/workload-deployment-uid" + // WorkloadDeploymentNameLabel carries the WorkloadDeployment name on each + // Instance. Unlike WorkloadDeploymentUIDLabel — which carries the + // edge/Karmada UID and therefore differs across federation planes — + // WorkloadDeploymentNameLabel is identical in the project cluster, Karmada, + // and on the edge, making it safe for cross-plane owner-ref resolution and + // CLI lookup. + WorkloadDeploymentNameLabel = LabelNamespace + "/workload-deployment-name" InstanceIndexLabel = LabelNamespace + "/instance-index" + + // CityCodeLabel carries the city code of the WorkloadDeployment that owns + // an Instance, matching WorkloadDeploymentSpec.CityCode. + CityCodeLabel = LabelNamespace + "/city-code" + + // WorkloadNameLabel carries the name of the Workload that an Instance + // ultimately belongs to, sourced from WorkloadDeploymentSpec.WorkloadRef.Name. + WorkloadNameLabel = LabelNamespace + "/workload-name" + + // PlacementNameLabel carries the placement name from the Workload that drove + // this Instance's deployment, sourced from WorkloadDeploymentSpec.PlacementName. + PlacementNameLabel = LabelNamespace + "/placement-name" ) diff --git a/api/v1alpha/workloaddeployment_types.go b/api/v1alpha/workloaddeployment_types.go index 7da27c89..a00d400c 100644 --- a/api/v1alpha/workloaddeployment_types.go +++ b/api/v1alpha/workloaddeployment_types.go @@ -49,14 +49,28 @@ type WorkloadDeploymentStatus struct { // The number of instances created Replicas int32 `json:"replicas"` - // The number of instances which have the latest workload settings applied. 
+ // The number of instances which have the latest workload settings applied + // and are programmed (a subset of UpdatedReplicas that are ready to serve). CurrentReplicas int32 `json:"currentReplicas"` + // The number of instances updated to the latest template revision (their + // observed template hash matches the desired template), regardless of + // readiness. Lags Replicas during a rolling update or restart, then catches + // back up — making an in-progress roll observable. + UpdatedReplicas int32 `json:"updatedReplicas"` + // The desired number of instances DesiredReplicas int32 `json:"desiredReplicas"` // The number of instances which are ready. ReadyReplicas int32 `json:"readyReplicas"` + + // The most recent generation observed by the deployment controller. When + // this matches metadata.generation, the controller has reconciled the + // latest spec (e.g. a restart request). + // + // +kubebuilder:validation:Optional + ObservedGeneration int64 `json:"observedGeneration,omitempty"` } const ( diff --git a/api/v1alpha/zz_generated.deepcopy.go b/api/v1alpha/zz_generated.deepcopy.go index 8ecc1bae..926e222c 100644 --- a/api/v1alpha/zz_generated.deepcopy.go +++ b/api/v1alpha/zz_generated.deepcopy.go @@ -651,6 +651,16 @@ func (in *ResourceMetricSource) DeepCopy() *ResourceMetricSource { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *SandboxContainer) DeepCopyInto(out *SandboxContainer) { *out = *in + if in.Command != nil { + in, out := &in.Command, &out.Command + *out = make([]string, len(*in)) + copy(*out, *in) + } + if in.Args != nil { + in, out := &in.Args, &out.Args + *out = make([]string, len(*in)) + copy(*out, *in) + } if in.Env != nil { in, out := &in.Env, &out.Env *out = make([]v1.EnvVar, len(*in)) diff --git a/cmd/datumctl-compute/main.go b/cmd/datumctl-compute/main.go new file mode 100644 index 00000000..4571a580 --- /dev/null +++ b/cmd/datumctl-compute/main.go @@ -0,0 +1,26 @@ +package main + +import ( + "os" + + "go.datum.net/datumctl/plugin" + + "go.datum.net/compute/internal/cmd/compute" +) + +// version is set at build time via ldflags. +var version = "dev" + +func main() { + plugin.ServeManifest(plugin.Manifest{ + Name: "compute", + Version: version, + Description: "Deploy and manage containerized workloads on Datum Cloud", + APIVersion: 1, + MinAPIVersion: 1, + }) + + if err := compute.Command().Execute(); err != nil { + os.Exit(1) + } +} diff --git a/cmd/main.go b/cmd/main.go index 3bb44bc9..01d3eddd 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -8,6 +8,8 @@ import ( "flag" "fmt" "os" + "strings" + "time" // Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.) // to ensure that exec-entrypoint and run can make use of them. 
@@ -18,29 +20,42 @@ import ( "k8s.io/apimachinery/pkg/runtime/serializer" utilruntime "k8s.io/apimachinery/pkg/util/runtime" clientgoscheme "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/rest" + "k8s.io/client-go/tools/clientcmd" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/cluster" "sigs.k8s.io/controller-runtime/pkg/healthz" "sigs.k8s.io/controller-runtime/pkg/log/zap" "sigs.k8s.io/controller-runtime/pkg/manager" + metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" "sigs.k8s.io/controller-runtime/pkg/webhook" mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager" "sigs.k8s.io/multicluster-runtime/pkg/multicluster" mcsingle "sigs.k8s.io/multicluster-runtime/providers/single" + karmadaclusterv1alpha1 "github.com/karmada-io/api/cluster/v1alpha1" + karmadapolicyv1alpha1 "github.com/karmada-io/api/policy/v1alpha1" computev1alpha "go.datum.net/compute/api/v1alpha" "go.datum.net/compute/internal/config" "go.datum.net/compute/internal/controller" + "go.datum.net/compute/internal/features" + quotametrics "go.datum.net/compute/internal/quota" computewebhook "go.datum.net/compute/internal/webhook" computev1alphawebhooks "go.datum.net/compute/internal/webhook/v1alpha" networkingv1alpha "go.datum.net/network-services-operator/api/v1alpha" quotav1alpha1 "go.miloapis.com/milo/pkg/apis/quota/v1alpha1" + "go.miloapis.com/milo/pkg/downstreamclient" multiclusterproviders "go.miloapis.com/milo/pkg/multicluster-runtime" milomulticluster "go.miloapis.com/milo/pkg/multicluster-runtime/milo" + corev1 "k8s.io/api/core/v1" // +kubebuilder:scaffold:imports ) +// singleClusterName is the fixed cluster name that mcsingle.New registers. +// All single-mode wiring that references this cluster must use this constant. 
+const singleClusterName = "single" + var ( scheme = runtime.NewScheme() setupLog = ctrl.Log.WithName("setup") @@ -51,6 +66,11 @@ var ( gitCommit = "unknown" gitTreeState = "unknown" buildDate = "unknown" + + // federationRestConfig holds the REST config for the Karmada federation control + // plane. It is populated from --federation-kubeconfig when set, and is nil + // when the flag is omitted. + federationRestConfig *rest.Config ) func init() { @@ -61,22 +81,45 @@ func init() { utilruntime.Must(computev1alpha.AddToScheme(scheme)) utilruntime.Must(networkingv1alpha.AddToScheme(scheme)) utilruntime.Must(quotav1alpha1.AddToScheme(scheme)) + utilruntime.Must(karmadapolicyv1alpha1.Install(scheme)) + utilruntime.Must(karmadaclusterv1alpha1.Install(scheme)) // +kubebuilder:scaffold:scheme } +//nolint:gocyclo // main wires all controller paths; complexity is inherent to startup sequencing func main() { var enableLeaderElection bool var leaderElectionNamespace string var probeAddr string var serverConfigFile string + var federationKubeconfig string + var federationContext string + var enableManagementControllers bool + var enableCellControllers bool flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.") flag.BoolVar(&enableLeaderElection, "leader-elect", false, "Enable leader election for controller manager. "+ "Enabling this will ensure there is only one active controller manager.") flag.StringVar(&leaderElectionNamespace, "leader-elect-namespace", "", "The namespace to use for leader election.") + flag.StringVar(&federationKubeconfig, "federation-kubeconfig", "", + "Path to the kubeconfig file for the Karmada federation control plane. "+ + "Required when --enable-management-controllers is set. "+ + "When omitted, federation features are disabled.") + flag.StringVar(&federationContext, "federation-context", "", + "Context to use from the federation kubeconfig. 
When omitted, the current context is used.") + flag.BoolVar(&enableManagementControllers, "enable-management-controllers", false, + "Enable management-plane controllers (WorkloadDeploymentFederator, InstanceProjector).") + flag.BoolVar(&enableCellControllers, "enable-cell-controllers", false, + "Enable cell controllers (WorkloadDeploymentReconciler, InstanceReconciler).") + + var featureGatesFlag string + flag.StringVar(&featureGatesFlag, "feature-gates", "", + "A set of key=value pairs that describe feature gates for the compute operator. "+ + "Example: --feature-gates=NetworkingIntegration=false. "+ + "Available features: NetworkingIntegration (default=true).") opts := zap.Options{ Development: true, @@ -87,8 +130,47 @@ func main() { opts.BindFlags(flag.CommandLine) flag.Parse() + // Install the logger before anything logs: controller-runtime drops messages emitted before SetLogger. ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts))) + if featureGatesFlag != "" { + if err := features.MutableFeatureGate.Set(featureGatesFlag); err != nil { + setupLog.Error(err, "unable to parse feature gates", "feature-gates", featureGatesFlag) + os.Exit(1) + } + } + setupLog.Info("feature gates", "NetworkingIntegration", features.FeatureGate.Enabled(features.NetworkingIntegration)) + // Load the federation (Karmada) control plane REST config when + // --federation-kubeconfig is provided. When the flag is omitted, + // federationRestConfig remains nil; management controllers will refuse to + // start if --enable-management-controllers is also set.
+ if federationKubeconfig != "" { + loader := clientcmd.NewNonInteractiveDeferredLoadingClientConfig( + &clientcmd.ClientConfigLoadingRules{ExplicitPath: federationKubeconfig}, + &clientcmd.ConfigOverrides{CurrentContext: federationContext}, + ) + var err error + federationRestConfig, err = loader.ClientConfig() + if err != nil { + setupLog.Error(err, "unable to load federation kubeconfig", "path", federationKubeconfig) + os.Exit(1) + } + setupLog.Info("federation kubeconfig loaded", "path", federationKubeconfig) + } + + // Fail loud: management controllers require a federation kubeconfig. Silently + // skipping them when --enable-management-controllers is set would leave + // federation and instance projection broken with no visible signal — the same + // class of failure as the quota P1 issue. An operator who explicitly enables + // management controllers but omits --federation-kubeconfig has a misconfiguration + // that must surface immediately rather than at runtime. + if enableManagementControllers && federationRestConfig == nil { + setupLog.Error(nil, + "management controllers enabled but no federation kubeconfig configured", + "hint", "set --federation-kubeconfig") + os.Exit(1) + } + setupLog.Info("starting compute", "version", version, "gitCommit", gitCommit, @@ -96,24 +178,28 @@ func main() { "buildDate", buildDate, ) - var serverConfig config.WorkloadOperator - var configData []byte - if len(serverConfigFile) > 0 { - var err error - configData, err = os.ReadFile(serverConfigFile) - if err != nil { - setupLog.Error(fmt.Errorf("unable to read server config from %q", serverConfigFile), "") - os.Exit(1) - } - } - - if err := runtime.DecodeInto(codecs.UniversalDecoder(), configData, &serverConfig); err != nil { - setupLog.Error(err, "unable to decode server config") + serverConfig, err := loadServerConfig(serverConfigFile) + if err != nil { + setupLog.Error(err, "unable to load server config") os.Exit(1) } setupLog.Info("server config", "config", serverConfig) 
+ quotaRestConfig, err := serverConfig.Discovery.QuotaRestConfig() + if err != nil { + setupLog.Error(err, "unable to load quota REST config") + os.Exit(1) + } + if quotaRestConfig != nil { + setupLog.Info("quota REST config loaded", "path", serverConfig.Discovery.QuotaKubeconfigPath) + quotametrics.EnforcementEnabled.Set(1) + } else { + setupLog.Error(nil, "quota enforcement is DISABLED — workloads will schedule without quota accounting; "+ + "set quotaKubeconfigPath in server config to enable enforcement") + quotametrics.EnforcementEnabled.Set(0) + } + cfg := ctrl.GetConfigOrDie() deploymentCluster, err := cluster.New(cfg, func(o *cluster.Options) { @@ -124,7 +210,9 @@ func main() { os.Exit(1) } - runnables, provider, err := initializeClusterDiscovery(serverConfig, deploymentCluster, scheme) + runnables, provider, edgeClusterName, err := initializeClusterDiscovery( + serverConfig, deploymentCluster, scheme, + ) if err != nil { setupLog.Error(err, "unable to initialize cluster discovery") os.Exit(1) @@ -176,21 +264,65 @@ func main() { os.Exit(1) } - if err = (&controller.WorkloadReconciler{}).SetupWithManager(mgr); err != nil { - setupLog.Error(err, "unable to create controller", "controller", "Workload") - os.Exit(1) + if enableManagementControllers { + if err = (&controller.WorkloadReconciler{}).SetupWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create controller", "controller", "Workload") + os.Exit(1) + } } - if err = (&controller.WorkloadDeploymentReconciler{}).SetupWithManager(mgr); err != nil { - setupLog.Error(err, "unable to create controller", "controller", "WorkloadDeployment") - os.Exit(1) + + // Build a single federation client shared across all controllers that need to + // read or write to the Karmada federation control plane. This is the hub that + // the management controllers federate through and that edge cells write back to. + // Nil when --federation-kubeconfig is not set (i.e. federation is disabled). 
+ var federationClient client.Client + if federationRestConfig != nil { + federationClient, err = client.New(federationRestConfig, client.Options{Scheme: scheme}) + if err != nil { + setupLog.Error(err, "unable to create federation client") + os.Exit(1) + } } - if err = (&controller.WorkloadDeploymentScheduler{}).SetupWithManager(mgr); err != nil { - setupLog.Error(err, "unable to create controller", "controller", "WorkloadDeploymentScheduler") - os.Exit(1) + + if enableCellControllers { + if err = (&controller.WorkloadDeploymentReconciler{ + NetworkingEnabled: features.FeatureGate.Enabled(features.NetworkingIntegration), + }).SetupWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create controller", "controller", "WorkloadDeployment") + os.Exit(1) + } } - if err = (&controller.InstanceReconciler{}).SetupWithManager(mgr, deploymentCluster); err != nil { - setupLog.Error(err, "unable to create controller", "controller", "Instance") - os.Exit(1) + + if enableCellControllers { + clusterNameForProject := func(_ string) multicluster.ClusterName { + return multicluster.ClusterName(singleClusterName) + } + instanceReconciler := &controller.InstanceReconciler{FederationClient: federationClient} + err = instanceReconciler.SetupWithManager( + mgr, + quotaRestConfig, + singleModeProjectID(mgr), + singleModeProjectNamespace(mgr), + edgeClusterName, + clusterNameForProject, + ) + if err != nil { + setupLog.Error(err, "unable to create controller", "controller", "Instance") + os.Exit(1) + } + } + + // WorkloadDeploymentFederator and InstanceProjector are management-plane + // controllers that run on the control-plane cluster. The fail-loud guard above + // ensures federationRestConfig is non-nil when enableManagementControllers is + // true; the nil check here is a defensive belt-and-suspenders guard. 
+ if enableManagementControllers && federationRestConfig != nil { + extra, err := setupManagementControllers(mgr, federationClient) + if err != nil { + setupLog.Error(err, "unable to set up management controllers") + os.Exit(1) + } + runnables = append(runnables, extra...) } if serverConfig.WebhookServer != nil { @@ -223,11 +355,6 @@ func main() { }) } - setupLog.Info("starting cluster discovery provider") - g.Go(func() error { - return ignoreCanceled(provider.Run(ctx, mgr)) - }) - setupLog.Info("starting multicluster manager") g.Go(func() error { return ignoreCanceled(mgr.Start(ctx)) @@ -239,51 +366,33 @@ func main() { } } -type runnableProvider interface { - multicluster.Provider - Run(context.Context, mcmanager.Manager) error -} - -// Needed until we contribute the patch in the following PR again (need to sign CLA): -// -// See: https://github.com/kubernetes-sigs/multicluster-runtime/pull/18 -type wrappedSingleClusterProvider struct { - multicluster.Provider - cluster cluster.Cluster -} - -func (p *wrappedSingleClusterProvider) Run(ctx context.Context, mgr mcmanager.Manager) error { - if err := mgr.Engage(ctx, "single", p.cluster); err != nil { - return err - } - return p.Provider.(runnableProvider).Run(ctx, mgr) -} - func initializeClusterDiscovery( serverConfig config.WorkloadOperator, deploymentCluster cluster.Cluster, scheme *runtime.Scheme, -) (runnables []manager.Runnable, provider runnableProvider, err error) { +) (runnables []manager.Runnable, provider multicluster.Provider, edgeClusterName string, err error) { runnables = append(runnables, deploymentCluster) switch serverConfig.Discovery.Mode { case multiclusterproviders.ProviderSingle: - provider = &wrappedSingleClusterProvider{ - Provider: mcsingle.New("single", deploymentCluster), - cluster: deploymentCluster, + provider = mcsingle.New(multicluster.ClusterName(singleClusterName), deploymentCluster) + edgeClusterName = serverConfig.Discovery.ClusterName + if edgeClusterName == "" { + edgeClusterName = 
singleClusterName } case multiclusterproviders.ProviderMilo: discoveryRestConfig, err := serverConfig.Discovery.DiscoveryRestConfig() if err != nil { - return nil, nil, fmt.Errorf("unable to get discovery rest config: %w", err) + return nil, nil, "", fmt.Errorf("unable to get discovery rest config: %w", err) } projectRestConfig, err := serverConfig.Discovery.ProjectRestConfig() if err != nil { - return nil, nil, fmt.Errorf("unable to get project rest config: %w", err) + return nil, nil, "", fmt.Errorf("unable to get project rest config: %w", err) } discoveryManager, err := manager.New(discoveryRestConfig, manager.Options{ + Metrics: metricsserver.Options{BindAddress: "0"}, Client: client.Options{ Cache: &client.CacheOptions{ Unstructured: true, @@ -291,7 +400,7 @@ func initializeClusterDiscovery( }, }) if err != nil { - return nil, nil, fmt.Errorf("unable to set up overall controller manager: %w", err) + return nil, nil, "", fmt.Errorf("unable to set up overall controller manager: %w", err) } provider, err = milomulticluster.New(discoveryManager, milomulticluster.Options{ @@ -304,10 +413,11 @@ func initializeClusterDiscovery( ProjectRestConfig: projectRestConfig, }) if err != nil { - return nil, nil, fmt.Errorf("unable to create datum project provider: %w", err) + return nil, nil, "", fmt.Errorf("unable to create datum project provider: %w", err) } runnables = append(runnables, discoveryManager) + edgeClusterName = serverConfig.Discovery.ClusterName // case providers.ProviderKind: // provider = mckind.New(mckind.Options{ @@ -319,13 +429,29 @@ func initializeClusterDiscovery( // }) default: - return nil, nil, fmt.Errorf( + return nil, nil, "", fmt.Errorf( "unsupported cluster discovery mode %s", serverConfig.Discovery.Mode, ) } - return runnables, provider, nil + return runnables, provider, edgeClusterName, nil +} + +func loadServerConfig(path string) (config.WorkloadOperator, error) { + var serverConfig config.WorkloadOperator + var configData []byte + if len(path) 
> 0 { + var err error + configData, err = os.ReadFile(path) + if err != nil { + return serverConfig, fmt.Errorf("unable to read server config from %q: %w", path, err) + } + } + if err := runtime.DecodeInto(codecs.UniversalDecoder(), configData, &serverConfig); err != nil { + return serverConfig, fmt.Errorf("unable to decode server config: %w", err) + } + return serverConfig, nil } func ignoreCanceled(err error) error { @@ -334,3 +460,102 @@ func ignoreCanceled(err error) error { } return err } + +// setupManagementControllers wires the WorkloadDeploymentFederator and +// InstanceProjector onto mgr. It returns any additional Runnable objects that +// must be started alongside the main manager (the federation manager used by +// InstanceProjector). Called only when management controllers are enabled and +// a federation REST config is available. +func setupManagementControllers(mgr mcmanager.Manager, federationClient client.Client) ([]manager.Runnable, error) { + federator := &controller.WorkloadDeploymentFederator{FederationClient: federationClient} + if err := federator.SetupWithManager(mgr); err != nil { + return nil, fmt.Errorf("WorkloadDeploymentFederator: %w", err) + } + + // InstanceProjector runs in the management plane, watches Instances written + // back by POP-cell operators to the Karmada federation control plane, and + // projects them into the corresponding project namespaces via the multicluster manager. 
+ federationMgr, err := manager.New(federationRestConfig, manager.Options{ + Scheme: scheme, + Metrics: metricsserver.Options{BindAddress: "0"}, + }) + if err != nil { + return nil, fmt.Errorf("federation manager for InstanceProjector: %w", err) + } + if err = (&controller.InstanceProjector{ + FederationClient: federationClient, + MCManager: mgr, + }).SetupWithManager(federationMgr); err != nil { + return nil, fmt.Errorf("InstanceProjector: %w", err) + } + + return []manager.Runnable{federationMgr}, nil +} + +// singleModeProjectID returns an InstanceProjectIDFunc for single-cell mode. +// It reads the upstream-cluster-name label on the edge namespace (e.g. +// "cluster-datum-cloud") and decodes it to the project ID ("datum-cloud"). +// This is the inverse of the "cluster-" encoding used by NSO's +// MappedNamespaceResourceStrategy when stamping cluster-scoped namespace labels. +// Returns ("", err) on transient API failures (triggers requeue with backoff). +// Returns ("", nil) when the label is absent (not yet propagated; quota skipped). +func singleModeProjectID(mgr mcmanager.Manager) controller.InstanceProjectIDFunc { + return func(ctx context.Context, cn multicluster.ClusterName, inst *computev1alpha.Instance) (string, error) { + ns, err := readEdgeNamespace(ctx, mgr, cn, inst.Namespace) + if err != nil { + return "", err + } + encoded := ns.Labels[downstreamclient.UpstreamOwnerClusterNameLabel] + if encoded == "" { + setupLog.Info("singleModeProjectID: upstream-cluster-name label missing", + "namespace", inst.Namespace) + return "", nil + } + projectID := strings.TrimPrefix(encoded, "cluster-") + return strings.ReplaceAll(projectID, "_", "/"), nil + } +} + +// singleModeProjectNamespace returns an InstanceProjectNamespaceFunc for +// single-cell mode. It reads the upstream-namespace label on the edge namespace +// (e.g. "ns-efdf8ca1-...") to find the in-project namespace ("default") where +// ResourceClaims must be created in the project control plane. 
+// Returns ("", err) on transient API failures (triggers requeue with backoff). +// Returns ("", nil) when the label is absent (not yet propagated; quota skipped). +func singleModeProjectNamespace(mgr mcmanager.Manager) controller.InstanceProjectNamespaceFunc { + return func(ctx context.Context, cn multicluster.ClusterName, inst *computev1alpha.Instance) (string, error) { + ns, err := readEdgeNamespace(ctx, mgr, cn, inst.Namespace) + if err != nil { + return "", err + } + projectNS := ns.Labels[downstreamclient.UpstreamOwnerNamespaceLabel] + if projectNS == "" { + setupLog.Info("singleModeProjectNamespace: upstream-namespace label missing", + "namespace", inst.Namespace) + return "", nil + } + return projectNS, nil + } +} + +// readEdgeNamespace reads the edge namespace object via the uncached APIReader +// (no informer started, no cache sync required) with a short deadline. +// Returns a transient error on API failures so callers can requeue with backoff. +func readEdgeNamespace( + ctx context.Context, + mgr mcmanager.Manager, + clusterName multicluster.ClusterName, + namespace string, +) (corev1.Namespace, error) { + cl, err := mgr.GetCluster(ctx, clusterName) + if err != nil { + return corev1.Namespace{}, fmt.Errorf("readEdgeNamespace: getting cluster %q: %w", clusterName, err) + } + var ns corev1.Namespace + getCtx, cancel := context.WithTimeout(ctx, 5*time.Second) + defer cancel() + if err := cl.GetAPIReader().Get(getCtx, client.ObjectKey{Name: namespace}, &ns); err != nil { + return corev1.Namespace{}, fmt.Errorf("readEdgeNamespace: reading namespace %q: %w", namespace, err) + } + return ns, nil +} diff --git a/config/base/certmanager/certificate.yaml b/config/base/certmanager/certificate.yaml deleted file mode 100644 index 3b15b5b3..00000000 --- a/config/base/certmanager/certificate.yaml +++ /dev/null @@ -1,27 +0,0 @@ -apiVersion: cert-manager.io/v1 -kind: Issuer -metadata: - labels: - app.kubernetes.io/name: compute - app.kubernetes.io/managed-by: 
kustomize - name: selfsigned-issuer -spec: - selfSigned: {} ---- -apiVersion: cert-manager.io/v1 -kind: Certificate -metadata: - labels: - app.kubernetes.io/name: compute - app.kubernetes.io/managed-by: kustomize - name: compute-serving-cert -spec: - # The Service name and namespace get substituted in by kustomize - # replacements in the consuming overlay. - dnsNames: - - SERVICE_NAME.SERVICE_NAMESPACE.svc - - SERVICE_NAME.SERVICE_NAMESPACE.svc.cluster.local - issuerRef: - kind: Issuer - name: selfsigned-issuer - secretName: compute-webhook-cert diff --git a/config/base/certmanager/kustomization.yaml b/config/base/certmanager/kustomization.yaml deleted file mode 100644 index bebea5a5..00000000 --- a/config/base/certmanager/kustomization.yaml +++ /dev/null @@ -1,5 +0,0 @@ -resources: -- certificate.yaml - -configurations: -- kustomizeconfig.yaml diff --git a/config/base/certmanager/kustomizeconfig.yaml b/config/base/certmanager/kustomizeconfig.yaml deleted file mode 100644 index cf6f89e8..00000000 --- a/config/base/certmanager/kustomizeconfig.yaml +++ /dev/null @@ -1,8 +0,0 @@ -# This configuration is for teaching kustomize how to update name ref substitution -nameReference: -- kind: Issuer - group: cert-manager.io - fieldSpecs: - - kind: Certificate - group: cert-manager.io - path: spec/issuerRef/name diff --git a/config/base/crd/bases/compute.datumapis.com_instances.yaml b/config/base/crd/bases/compute.datumapis.com_instances.yaml index 8c86fb90..a007c0d7 100644 --- a/config/base/crd/bases/compute.datumapis.com_instances.yaml +++ b/config/base/crd/bases/compute.datumapis.com_instances.yaml @@ -35,6 +35,10 @@ spec: name: Message priority: 1 type: string + - jsonPath: .status.conditions[?(@.type=="QuotaGranted")].reason + name: Quota + priority: 1 + type: string name: v1alpha schema: openAPIV3Schema: @@ -262,6 +266,28 @@ spec: description: A list of containers to run within the sandbox. 
items: properties: + args: + description: |- + Arguments to the entrypoint, overriding the image's CMD. Combined with + Command: when Command is also set the resulting invocation is + append(Command, Args...). When only Args is set it overrides CMD while + preserving the image's ENTRYPOINT. + + If neither Command nor Args is set, the image's own ENTRYPOINT and CMD + are used unchanged. + items: + type: string + type: array + command: + description: |- + Entrypoint array to run in the container image, overriding the image's + ENTRYPOINT. Each element is a separate token, not a shell command — to run a + shell command use: ["sh", "-c", "my command"]. + + If not provided, the container image's own ENTRYPOINT is used. + items: + type: string + type: array env: description: |- List of environment variables to set in the container. @@ -272,8 +298,9 @@ spec: present in a Container. properties: name: - description: Name of the environment variable. - Must be a C_IDENTIFIER. + description: |- + Name of the environment variable. + May consist of any printable ASCII characters except '='. type: string value: description: |- @@ -332,6 +359,43 @@ spec: - fieldPath type: object x-kubernetes-map-type: atomic + fileKeyRef: + description: |- + FileKeyRef selects a key of the env file. + Requires the EnvFiles feature gate to be enabled. + properties: + key: + description: |- + The key within the env file. An invalid key will prevent the pod from starting. + The keys defined within a source may consist of any printable ASCII characters except '='. + During Alpha stage of the EnvFiles feature gate, the key size is limited to 128 characters. + type: string + optional: + default: false + description: |- + Specify whether the file or its key must be defined. If the file or key + does not exist, then the env var is not published. + If optional is set to true and the specified key does not exist, + the environment variable will not be set in the Pod's containers. 
+ + If optional is set to false and the specified key does not exist, + an error will be returned during Pod creation. + type: boolean + path: + description: |- + The path within the volume from which to select the file. + Must be relative and may not contain the '..' path or start with '..'. + type: string + volumeName: + description: The name of the volume mount + containing the env file. + type: string + required: + - key + - path + - volumeName + type: object + x-kubernetes-map-type: atomic resourceFieldRef: description: |- Selects a resource of the container: only resources limits and requests @@ -823,12 +887,17 @@ spec: message: Waiting for controller reason: Pending status: Unknown - type: Running + type: Available - lastTransitionTime: "1970-01-01T00:00:00Z" message: Waiting for controller reason: Pending status: Unknown type: Ready + - lastTransitionTime: "1970-01-01T00:00:00Z" + message: Waiting for quota evaluation + reason: PendingEvaluation + status: Unknown + type: QuotaGranted description: Status defines the current state of an Instance. properties: conditions: diff --git a/config/base/crd/bases/compute.datumapis.com_workloaddeployments.yaml b/config/base/crd/bases/compute.datumapis.com_workloaddeployments.yaml index 50c9458b..48a2501d 100644 --- a/config/base/crd/bases/compute.datumapis.com_workloaddeployments.yaml +++ b/config/base/crd/bases/compute.datumapis.com_workloaddeployments.yaml @@ -375,6 +375,28 @@ spec: sandbox. items: properties: + args: + description: |- + Arguments to the entrypoint, overriding the image's CMD. Combined with + Command: when Command is also set the resulting invocation is + append(Command, Args...). When only Args is set it overrides CMD while + preserving the image's ENTRYPOINT. + + If neither Command nor Args is set, the image's own ENTRYPOINT and CMD + are used unchanged. 
+ items: + type: string + type: array + command: + description: |- + Entrypoint array to run in the container image, overriding the image's + ENTRYPOINT. Each element is a separate token, not a shell command — to run a + shell command use: ["sh", "-c", "my command"]. + + If not provided, the container image's own ENTRYPOINT is used. + items: + type: string + type: array env: description: |- List of environment variables to set in the container. @@ -385,8 +407,9 @@ spec: variable present in a Container. properties: name: - description: Name of the environment variable. - Must be a C_IDENTIFIER. + description: |- + Name of the environment variable. + May consist of any printable ASCII characters except '='. type: string value: description: |- @@ -448,6 +471,43 @@ spec: - fieldPath type: object x-kubernetes-map-type: atomic + fileKeyRef: + description: |- + FileKeyRef selects a key of the env file. + Requires the EnvFiles feature gate to be enabled. + properties: + key: + description: |- + The key within the env file. An invalid key will prevent the pod from starting. + The keys defined within a source may consist of any printable ASCII characters except '='. + During Alpha stage of the EnvFiles feature gate, the key size is limited to 128 characters. + type: string + optional: + default: false + description: |- + Specify whether the file or its key must be defined. If the file or key + does not exist, then the env var is not published. + If optional is set to true and the specified key does not exist, + the environment variable will not be set in the Pod's containers. + + If optional is set to false and the specified key does not exist, + an error will be returned during Pod creation. + type: boolean + path: + description: |- + The path within the volume from which to select the file. + Must be relative and may not contain the '..' path or start with '..'. + type: string + volumeName: + description: The name of the volume + mount containing the env file. 
+ type: string + required: + - key + - path + - volumeName + type: object + x-kubernetes-map-type: atomic resourceFieldRef: description: |- Selects a resource of the container: only resources limits and requests diff --git a/config/base/crd/bases/compute.datumapis.com_workloads.yaml b/config/base/crd/bases/compute.datumapis.com_workloads.yaml index edae1e1c..c452910f 100644 --- a/config/base/crd/bases/compute.datumapis.com_workloads.yaml +++ b/config/base/crd/bases/compute.datumapis.com_workloads.yaml @@ -385,6 +385,28 @@ spec: sandbox. items: properties: + args: + description: |- + Arguments to the entrypoint, overriding the image's CMD. Combined with + Command: when Command is also set the resulting invocation is + append(Command, Args...). When only Args is set it overrides CMD while + preserving the image's ENTRYPOINT. + + If neither Command nor Args is set, the image's own ENTRYPOINT and CMD + are used unchanged. + items: + type: string + type: array + command: + description: |- + Entrypoint array to run in the container image, overriding the image's + ENTRYPOINT. Each element is a separate token, not a shell command — to run a + shell command use: ["sh", "-c", "my command"]. + + If not provided, the container image's own ENTRYPOINT is used. + items: + type: string + type: array env: description: |- List of environment variables to set in the container. @@ -395,8 +417,9 @@ spec: variable present in a Container. properties: name: - description: Name of the environment variable. - Must be a C_IDENTIFIER. + description: |- + Name of the environment variable. + May consist of any printable ASCII characters except '='. type: string value: description: |- @@ -458,6 +481,43 @@ spec: - fieldPath type: object x-kubernetes-map-type: atomic + fileKeyRef: + description: |- + FileKeyRef selects a key of the env file. + Requires the EnvFiles feature gate to be enabled. + properties: + key: + description: |- + The key within the env file. 
An invalid key will prevent the pod from starting. + The keys defined within a source may consist of any printable ASCII characters except '='. + During Alpha stage of the EnvFiles feature gate, the key size is limited to 128 characters. + type: string + optional: + default: false + description: |- + Specify whether the file or its key must be defined. If the file or key + does not exist, then the env var is not published. + If optional is set to true and the specified key does not exist, + the environment variable will not be set in the Pod's containers. + + If optional is set to false and the specified key does not exist, + an error will be returned during Pod creation. + type: boolean + path: + description: |- + The path within the volume from which to select the file. + Must be relative and may not contain the '..' path or start with '..'. + type: string + volumeName: + description: The name of the volume + mount containing the env file. + type: string + required: + - key + - path + - volumeName + type: object + x-kubernetes-map-type: atomic resourceFieldRef: description: |- Selects a resource of the container: only resources limits and requests diff --git a/config/base/downstream-rbac/kustomization.yaml b/config/base/downstream-rbac/kustomization.yaml new file mode 100644 index 00000000..4c4dbe44 --- /dev/null +++ b/config/base/downstream-rbac/kustomization.yaml @@ -0,0 +1,5 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - rbac.yaml diff --git a/config/base/downstream-rbac/rbac.yaml b/config/base/downstream-rbac/rbac.yaml new file mode 100644 index 00000000..1937ef02 --- /dev/null +++ b/config/base/downstream-rbac/rbac.yaml @@ -0,0 +1,35 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: compute-manager +rules: + - apiGroups: [""] + resources: ["namespaces"] + verbs: ["get", "list", "watch", "create", "update", "patch"] + - apiGroups: ["compute.datumapis.com"] + resources: 
["workloaddeployments", "workloaddeployments/status", "instances", "instances/status"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] + - apiGroups: ["policy.karmada.io"] + resources: ["propagationpolicies", "clusterpropagationpolicies"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] + - apiGroups: ["cluster.karmada.io"] + resources: ["clusters"] + verbs: ["get", "list", "watch"] + - apiGroups: ["work.karmada.io"] + resources: ["resourcebindings", "clusterresourcebindings"] + verbs: ["get", "list", "watch"] + - apiGroups: ["config.karmada.io"] + resources: ["resourceinterpreterwebhookconfigurations", "resourceinterpretercustomizations"] + verbs: ["get", "list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: compute-manager +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: compute-manager +subjects: + - kind: User + name: system:serviceaccount:compute-system:compute-manager diff --git a/config/base/federation/kustomization.yaml b/config/base/federation/kustomization.yaml new file mode 100644 index 00000000..1261dac6 --- /dev/null +++ b/config/base/federation/kustomization.yaml @@ -0,0 +1,10 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - ../crd/bases/compute.datumapis.com_instances.yaml + - ../crd/bases/compute.datumapis.com_workloaddeployments.yaml + - ../crd/bases/compute.datumapis.com_workloads.yaml + +components: + - ../../components/federation diff --git a/config/base/manager/manager.yaml b/config/base/manager/manager.yaml index 03028177..8ef18135 100644 --- a/config/base/manager/manager.yaml +++ b/config/base/manager/manager.yaml @@ -26,14 +26,33 @@ spec: seccompProfile: type: RuntimeDefault containers: - - command: + - name: manager + command: - /manager args: - - --leader-elect - - --health-probe-bind-address=:8081 - - --server-config=/config/config.yaml + - --leader-elect=$(LEADER_ELECT) + 
- --health-probe-bind-address=$(HEALTH_PROBE_BIND_ADDRESS) + - --server-config=$(SERVER_CONFIG) + - --federation-kubeconfig=$(FEDERATION_KUBECONFIG) + - --enable-management-controllers=$(ENABLE_MANAGEMENT_CONTROLLERS) + - --enable-cell-controllers=$(ENABLE_CELL_CONTROLLERS) + - --feature-gates=$(FEATURE_GATES) + env: + - name: LEADER_ELECT + value: "true" + - name: HEALTH_PROBE_BIND_ADDRESS + value: ":8081" + - name: SERVER_CONFIG + value: /config/config.yaml + - name: FEDERATION_KUBECONFIG + value: "" + - name: ENABLE_MANAGEMENT_CONTROLLERS + value: "false" + - name: ENABLE_CELL_CONTROLLERS + value: "false" + - name: FEATURE_GATES + value: "" image: ghcr.io/datum-cloud/compute:latest - name: manager ports: - containerPort: 9443 name: webhook-server @@ -66,20 +85,9 @@ spec: volumeMounts: - name: config mountPath: /config - - name: webhook-cert - mountPath: /tmp/k8s-webhook-server/serving-certs - readOnly: true - serviceAccountName: compute + serviceAccountName: compute-manager terminationGracePeriodSeconds: 10 volumes: - name: config configMap: name: compute-config - # Optional so the manager can run without admission webhooks: when - # `webhookServer:` is omitted from the server config, the binary - # skips the webhook server entirely and the missing Secret is fine. 
- - name: webhook-cert - secret: - secretName: compute-webhook-cert - defaultMode: 420 - optional: true diff --git a/config/base/manager/service_account.yaml b/config/base/manager/service_account.yaml index f8711deb..cc6bd6cc 100644 --- a/config/base/manager/service_account.yaml +++ b/config/base/manager/service_account.yaml @@ -4,4 +4,4 @@ metadata: labels: app.kubernetes.io/name: compute app.kubernetes.io/managed-by: kustomize - name: compute + name: compute-manager diff --git a/config/components/cell-controllers/kustomization.yaml b/config/components/cell-controllers/kustomization.yaml new file mode 100644 index 00000000..3f32da3b --- /dev/null +++ b/config/components/cell-controllers/kustomization.yaml @@ -0,0 +1,20 @@ +apiVersion: kustomize.config.k8s.io/v1alpha1 +kind: Component + +patches: + - target: + kind: Deployment + name: compute-manager + patch: |- + apiVersion: apps/v1 + kind: Deployment + metadata: + name: compute-manager + spec: + template: + spec: + containers: + - name: manager + env: + - name: ENABLE_CELL_CONTROLLERS + value: "true" diff --git a/config/components/controller_rbac/metrics_auth_role_binding.yaml b/config/components/controller_rbac/metrics_auth_role_binding.yaml index 1ea3d974..ada1a1de 100644 --- a/config/components/controller_rbac/metrics_auth_role_binding.yaml +++ b/config/components/controller_rbac/metrics_auth_role_binding.yaml @@ -8,4 +8,4 @@ roleRef: name: compute-metrics-auth-role subjects: - kind: ServiceAccount - name: compute + name: compute-manager diff --git a/config/components/controller_rbac/role.yaml b/config/components/controller_rbac/role.yaml index 5d803d2c..e8721899 100644 --- a/config/components/controller_rbac/role.yaml +++ b/config/components/controller_rbac/role.yaml @@ -4,6 +4,13 @@ kind: ClusterRole metadata: name: compute rules: +- apiGroups: + - "" + resources: + - namespaces + verbs: + - get + - list - apiGroups: - compute.datumapis.com resources: @@ -36,3 +43,36 @@ rules: - get - patch - update +- 
apiGroups: + - networking.datumapis.com + resources: + - locations + - networkcontexts + - subnets + verbs: + - get + - list + - watch +- apiGroups: + - networking.datumapis.com + resources: + - networkbindings + - subnetclaims + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - quota.miloapis.com + resources: + - resourceclaims + verbs: + - create + - delete + - get + - list + - watch diff --git a/config/components/controller_rbac/role_binding.yaml b/config/components/controller_rbac/role_binding.yaml index 6256bf3f..2f3e2676 100644 --- a/config/components/controller_rbac/role_binding.yaml +++ b/config/components/controller_rbac/role_binding.yaml @@ -11,4 +11,4 @@ roleRef: name: compute subjects: - kind: ServiceAccount - name: compute + name: compute-manager diff --git a/config/components/csi-webhook-cert/kustomization.yaml b/config/components/csi-webhook-cert/kustomization.yaml new file mode 100644 index 00000000..feade65a --- /dev/null +++ b/config/components/csi-webhook-cert/kustomization.yaml @@ -0,0 +1,32 @@ +apiVersion: kustomize.config.k8s.io/v1alpha1 +kind: Component + +patches: + # Add the CSI webhook cert volume and volumeMount to the manager Deployment. + # The issuer (csi.cert-manager.io/issuer-kind and csi.cert-manager.io/issuer-name) + # must be patched in by the consuming overlay or infra repo. 
+ - target: + kind: Deployment + name: compute-manager + patch: |- + apiVersion: apps/v1 + kind: Deployment + metadata: + name: compute-manager + spec: + template: + spec: + containers: + - name: manager + volumeMounts: + - name: webhook-server-tls + mountPath: /tmp/k8s-webhook-server/serving-certs + readOnly: true + volumes: + - name: webhook-server-tls + csi: + driver: csi.cert-manager.io + readOnly: true + volumeAttributes: + csi.cert-manager.io/fs-group: "65532" + csi.cert-manager.io/dns-names: compute-webhook.compute-system.svc,compute-webhook.compute-system.svc.cluster.local diff --git a/config/components/federation/kustomization.yaml b/config/components/federation/kustomization.yaml new file mode 100644 index 00000000..3ba207ff --- /dev/null +++ b/config/components/federation/kustomization.yaml @@ -0,0 +1,5 @@ +apiVersion: kustomize.config.k8s.io/v1alpha1 +kind: Component + +resources: + - workloaddeployment-interpreter.yaml diff --git a/config/components/federation/workloaddeployment-interpreter.yaml b/config/components/federation/workloaddeployment-interpreter.yaml new file mode 100644 index 00000000..2743a63b --- /dev/null +++ b/config/components/federation/workloaddeployment-interpreter.yaml @@ -0,0 +1,28 @@ +apiVersion: config.karmada.io/v1alpha1 +kind: ResourceInterpreterCustomization +metadata: + name: workloaddeployment +spec: + target: + apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + customizations: + statusReflection: + luaScript: | + function ReflectStatus(observedObj) + if observedObj.status == nil then + return nil + end + return observedObj.status + end + statusAggregation: + luaScript: | + function AggregateStatus(desiredObj, statusItems) + if statusItems == nil or #statusItems == 0 then + return desiredObj + end + if statusItems[1].status ~= nil then + desiredObj.status = statusItems[1].status + end + return desiredObj + end diff --git a/config/components/leader_election/leader_election_role_binding.yaml 
b/config/components/leader_election/leader_election_role_binding.yaml index a5fe9996..d6783c07 100644 --- a/config/components/leader_election/leader_election_role_binding.yaml +++ b/config/components/leader_election/leader_election_role_binding.yaml @@ -11,4 +11,4 @@ roleRef: name: compute-leader-election subjects: - kind: ServiceAccount - name: compute + name: compute-manager diff --git a/config/components/management-controllers/kustomization.yaml b/config/components/management-controllers/kustomization.yaml new file mode 100644 index 00000000..d1e29e7f --- /dev/null +++ b/config/components/management-controllers/kustomization.yaml @@ -0,0 +1,20 @@ +apiVersion: kustomize.config.k8s.io/v1alpha1 +kind: Component + +patches: + - target: + kind: Deployment + name: compute-manager + patch: |- + apiVersion: apps/v1 + kind: Deployment + metadata: + name: compute-manager + spec: + template: + spec: + containers: + - name: manager + env: + - name: ENABLE_MANAGEMENT_CONTROLLERS + value: "true" diff --git a/config/components/quota-credentials/kustomization.yaml b/config/components/quota-credentials/kustomization.yaml new file mode 100644 index 00000000..ffc9a6d8 --- /dev/null +++ b/config/components/quota-credentials/kustomization.yaml @@ -0,0 +1,26 @@ +apiVersion: kustomize.config.k8s.io/v1alpha1 +kind: Component + +patches: + - target: + kind: Deployment + name: compute-manager + patch: |- + apiVersion: apps/v1 + kind: Deployment + metadata: + name: compute-manager + spec: + template: + spec: + containers: + - name: manager + volumeMounts: + - name: quota-credentials + mountPath: /etc/quota-credentials + readOnly: true + volumes: + - name: quota-credentials + secret: + secretName: compute-quota-credentials + optional: true diff --git a/config/components/service-catalog/service-configuration.yaml b/config/components/service-catalog/service-configuration.yaml index 202ac8af..8c29a50e 100644 --- a/config/components/service-catalog/service-configuration.yaml +++ 
b/config/components/service-catalog/service-configuration.yaml @@ -6,6 +6,9 @@ spec: serviceRef: name: compute phase: Published + locations: + supportedClasses: + - datum-managed monitoredResourceTypes: - type: compute.datumapis.com/Instance displayName: Compute Instance @@ -44,6 +47,26 @@ spec: description: Seconds the instance has been in a running state. kind: Cumulative unit: s + - name: compute.datumapis.com/workloads + displayName: Compute Workloads + description: Number of compute workloads. + kind: Gauge + unit: '{workload}' + - name: compute.datumapis.com/instances + displayName: Compute Instances + description: Number of compute instances. + kind: Gauge + unit: '{instance}' + - name: compute.datumapis.com/vcpus + displayName: Compute vCPUs + description: Number of vCPUs allocated across all instances. + kind: Gauge + unit: '{millicore}' + - name: compute.datumapis.com/memory + displayName: Compute Memory + description: Memory allocated across all instances. + kind: Gauge + unit: MiB billing: consumerDestinations: - monitoredResourceType: compute.datumapis.com/Instance @@ -53,13 +76,13 @@ spec: - compute.datumapis.com/instance/cpu-allocated - compute.datumapis.com/instance/memory-allocated - compute.datumapis.com/instance/uptime-seconds + quota: metricRules: - selector: apiGroup: compute.datumapis.com kind: Workload metricCosts: compute.datumapis.com/workloads: 1 - quota: limits: - name: compute-workloads metric: compute.datumapis.com/workloads diff --git a/config/overlays/cell/disable_webhook_patch.yaml b/config/overlays/cell/disable_webhook_patch.yaml new file mode 100644 index 00000000..85b57f09 --- /dev/null +++ b/config/overlays/cell/disable_webhook_patch.yaml @@ -0,0 +1,12 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: compute-config +data: + config.yaml: | + apiVersion: apiserver.config.datumapis.com/v1alpha1 + kind: WorkloadOperator + metricsServer: + bindAddress: "0" + discovery: + quotaKubeconfigPath: /etc/quota-credentials/kubeconfig 
diff --git a/config/overlays/cell/kustomization.yaml b/config/overlays/cell/kustomization.yaml new file mode 100644 index 00000000..80925ee2 --- /dev/null +++ b/config/overlays/cell/kustomization.yaml @@ -0,0 +1,17 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +# All namespaced resources land here. Override via Flux's targetNamespace +# (or by editing this overlay) to install into a different namespace. +namespace: compute-system + +resources: + - ../../base/manager +components: + - ../../components/leader_election + - ../../components/controller_rbac + - ../../components/cell-controllers + - ../../components/quota-credentials + +patches: +- path: disable_webhook_patch.yaml diff --git a/config/overlays/dev/config.yaml b/config/overlays/dev/config.yaml index 1d49a6c6..6ef2f00e 100644 --- a/config/overlays/dev/config.yaml +++ b/config/overlays/dev/config.yaml @@ -2,9 +2,4 @@ apiVersion: apiserver.config.datumapis.com/v1alpha1 kind: WorkloadOperator metricsServer: bindAddress: "0" - -webhookServer: - tls: - secretRef: - name: compute-webhook-cert - namespace: kube-system +webhookServer: {} diff --git a/config/overlays/dev/kustomization.yaml b/config/overlays/dev/kustomization.yaml index 7b076890..339cee0f 100644 --- a/config/overlays/dev/kustomization.yaml +++ b/config/overlays/dev/kustomization.yaml @@ -1,55 +1,29 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: compute-system + resources: - ../../base/crd - ../../base/webhook - - ../../base/certmanager + - webhook-cert.yaml -replacements: - - source: - kind: Certificate - group: cert-manager.io - version: v1 - name: compute-serving-cert - fieldPath: .metadata.namespace - targets: - - select: - kind: ValidatingWebhookConfiguration - fieldPaths: - - .metadata.annotations.[cert-manager.io/inject-ca-from] - options: - delimiter: '/' - index: 0 - create: true - - select: - kind: MutatingWebhookConfiguration - fieldPaths: - - 
.metadata.annotations.[cert-manager.io/inject-ca-from] - options: - delimiter: '/' - index: 0 - create: true - - source: - kind: Certificate - group: cert-manager.io - version: v1 - name: compute-serving-cert - fieldPath: .metadata.name - targets: - - select: - kind: ValidatingWebhookConfiguration - fieldPaths: - - .metadata.annotations.[cert-manager.io/inject-ca-from] - options: - delimiter: '/' - index: 1 - create: true - - select: - kind: MutatingWebhookConfiguration - fieldPaths: - - .metadata.annotations.[cert-manager.io/inject-ca-from] - options: - delimiter: '/' - index: 1 - create: true +patches: + # Wire cainjector to the dev cert so the API server can verify the webhook. + - patch: |- + apiVersion: admissionregistration.k8s.io/v1 + kind: MutatingWebhookConfiguration + metadata: + name: compute-mutating + annotations: + cert-manager.io/inject-ca-from: compute-system/compute-serving-cert + - patch: |- + apiVersion: admissionregistration.k8s.io/v1 + kind: ValidatingWebhookConfiguration + metadata: + name: compute-validating + annotations: + cert-manager.io/inject-ca-from: compute-system/compute-serving-cert transformers: - webhook_patch.yaml diff --git a/config/overlays/dev/webhook-cert.yaml b/config/overlays/dev/webhook-cert.yaml new file mode 100644 index 00000000..db7bf928 --- /dev/null +++ b/config/overlays/dev/webhook-cert.yaml @@ -0,0 +1,18 @@ +apiVersion: cert-manager.io/v1 +kind: Issuer +metadata: + name: selfsigned-issuer +spec: + selfSigned: {} +--- +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: compute-serving-cert +spec: + dnsNames: + - host.docker.internal + issuerRef: + kind: Issuer + name: selfsigned-issuer + secretName: compute-webhook-cert diff --git a/config/overlays/dev/webhook_patch.yaml b/config/overlays/dev/webhook_patch.yaml index 846649e3..bb302318 100644 --- a/config/overlays/dev/webhook_patch.yaml +++ b/config/overlays/dev/webhook_patch.yaml @@ -1,23 +1,6 @@ --- apiVersion: builtin kind: PatchTransformer 
-metadata: - name: webhook-cert-patch -patch: |- - - op: replace - path: /spec/dnsNames - value: ["host.docker.internal"] - - op: replace - path: /spec/secretName - value: compute-webhook-cert -target: - kind: Certificate - group: cert-manager.io - version: v1 - name: compute-serving-cert ---- -apiVersion: builtin -kind: PatchTransformer metadata: name: mutatingwebhook-url-patch patch: |- diff --git a/config/overlays/management-plane/discovery_mode_patch.yaml b/config/overlays/management-plane/discovery_mode_patch.yaml new file mode 100644 index 00000000..97bf762c --- /dev/null +++ b/config/overlays/management-plane/discovery_mode_patch.yaml @@ -0,0 +1,13 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: compute-config +data: + config.yaml: | + apiVersion: apiserver.config.datumapis.com/v1alpha1 + kind: WorkloadOperator + metricsServer: + bindAddress: "0" + webhookServer: {} + discovery: + mode: milo diff --git a/config/overlays/management-plane/downstream_kubeconfig_patch.yaml b/config/overlays/management-plane/downstream_kubeconfig_patch.yaml new file mode 100644 index 00000000..7b3b764b --- /dev/null +++ b/config/overlays/management-plane/downstream_kubeconfig_patch.yaml @@ -0,0 +1,29 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: compute-manager +spec: + template: + spec: + containers: + - name: manager + env: + - name: FEDERATION_KUBECONFIG + value: /etc/kubernetes/downstream/auth/downstream-kubeconfig.yaml + volumeMounts: + - name: downstream-kubeconfig + mountPath: /etc/kubernetes/downstream/auth + readOnly: true + - name: karmada-token + mountPath: /etc/kubernetes/karmada-token + readOnly: true + volumes: + - name: downstream-kubeconfig + configMap: + name: compute-downstream-kubeconfig + - name: karmada-token + projected: + sources: + - serviceAccountToken: + audience: https://karmada-apiserver.karmada-system.svc.cluster.local:5443 + path: token diff --git a/config/overlays/management-plane/kustomization.yaml 
b/config/overlays/management-plane/kustomization.yaml new file mode 100644 index 00000000..dae13c58 --- /dev/null +++ b/config/overlays/management-plane/kustomization.yaml @@ -0,0 +1,21 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +# All namespaced resources land here. Override via Flux's targetNamespace +# (or by editing this overlay) to install into a different namespace. +namespace: compute-system + +resources: + - ../../base/manager + - ../../base/webhook +components: + - ../../components/leader_election + - ../../components/controller_rbac + - ../../components/resource-metrics + - ../../components/high-availability + - ../../components/management-controllers + - ../../components/csi-webhook-cert + +patches: +- path: downstream_kubeconfig_patch.yaml +- path: discovery_mode_patch.yaml diff --git a/config/overlays/single-cluster/kustomization.yaml b/config/overlays/single-cluster/kustomization.yaml index 160b894d..4d72934e 100644 --- a/config/overlays/single-cluster/kustomization.yaml +++ b/config/overlays/single-cluster/kustomization.yaml @@ -9,100 +9,11 @@ resources: - ../../base/crd - ../../base/manager - ../../base/webhook - - ../../base/certmanager components: - ../../components/leader_election - ../../components/controller_rbac - ../../components/resource-metrics - ../../components/high-availability - -patches: -- path: webhookcainjection_patch.yaml - -replacements: -# Fill in SERVICE_NAME / SERVICE_NAMESPACE placeholders in the Certificate's -# dnsNames so the cert is issued for the actual webhook Service location. -- source: - kind: Service - version: v1 - name: compute-webhook - fieldPath: .metadata.name - targets: - - select: - kind: Certificate - group: cert-manager.io - version: v1 - name: compute-serving-cert - fieldPaths: - - .spec.dnsNames.0 - - .spec.dnsNames.1 - options: - delimiter: '.' 
- index: 0 - create: true -- source: - kind: Service - version: v1 - name: compute-webhook - fieldPath: .metadata.namespace - targets: - - select: - kind: Certificate - group: cert-manager.io - version: v1 - name: compute-serving-cert - fieldPaths: - - .spec.dnsNames.0 - - .spec.dnsNames.1 - options: - delimiter: '.' - index: 1 - create: true -# Wire the Certificate namespace + name into the cert-manager.io/inject-ca-from -# annotation on the webhook configurations so cainjector populates caBundle. -- source: - kind: Certificate - group: cert-manager.io - version: v1 - name: compute-serving-cert - fieldPath: .metadata.namespace - targets: - - select: - kind: ValidatingWebhookConfiguration - fieldPaths: - - .metadata.annotations.[cert-manager.io/inject-ca-from] - options: - delimiter: '/' - index: 0 - create: true - - select: - kind: MutatingWebhookConfiguration - fieldPaths: - - .metadata.annotations.[cert-manager.io/inject-ca-from] - options: - delimiter: '/' - index: 0 - create: true -- source: - kind: Certificate - group: cert-manager.io - version: v1 - name: compute-serving-cert - fieldPath: .metadata.name - targets: - - select: - kind: ValidatingWebhookConfiguration - fieldPaths: - - .metadata.annotations.[cert-manager.io/inject-ca-from] - options: - delimiter: '/' - index: 1 - create: true - - select: - kind: MutatingWebhookConfiguration - fieldPaths: - - .metadata.annotations.[cert-manager.io/inject-ca-from] - options: - delimiter: '/' - index: 1 - create: true + - ../../components/csi-webhook-cert + - ../../components/management-controllers + - ../../components/cell-controllers diff --git a/config/overlays/single-cluster/webhookcainjection_patch.yaml b/config/overlays/single-cluster/webhookcainjection_patch.yaml deleted file mode 100644 index 41718fb7..00000000 --- a/config/overlays/single-cluster/webhookcainjection_patch.yaml +++ /dev/null @@ -1,19 +0,0 @@ -apiVersion: admissionregistration.k8s.io/v1 -kind: MutatingWebhookConfiguration -metadata: - labels: 
- app.kubernetes.io/name: compute - app.kubernetes.io/managed-by: kustomize - name: compute-mutating - annotations: - cert-manager.io/inject-ca-from: system/compute-serving-cert ---- -apiVersion: admissionregistration.k8s.io/v1 -kind: ValidatingWebhookConfiguration -metadata: - labels: - app.kubernetes.io/name: compute - app.kubernetes.io/managed-by: kustomize - name: compute-validating - annotations: - cert-manager.io/inject-ca-from: system/compute-serving-cert diff --git a/go.mod b/go.mod index 19fc0103..e1b056c9 100644 --- a/go.mod +++ b/go.mod @@ -1,108 +1,120 @@ module go.datum.net/compute -go 1.24.0 - -toolchain go1.24.2 +go 1.25.8 require ( + github.com/go-logr/logr v1.4.3 github.com/google/go-cmp v0.7.0 - github.com/onsi/ginkgo/v2 v2.23.4 - github.com/onsi/gomega v1.37.0 + github.com/karmada-io/api v1.17.0 + github.com/onsi/ginkgo/v2 v2.27.2 + github.com/onsi/gomega v1.38.2 + github.com/prometheus/client_golang v1.23.2 + github.com/spf13/cobra v1.10.2 github.com/stretchr/testify v1.11.1 - go.datum.net/network-services-operator v0.1.0 - go.miloapis.com/milo v0.24.11 - golang.org/x/crypto v0.39.0 - golang.org/x/sync v0.16.0 + go.datum.net/datumctl v0.14.1-0.20260523153711-b44de1c715c1 + go.datum.net/network-services-operator v0.21.10-0.20260528021428-b0f2347f5359 + go.miloapis.com/milo v0.26.1-0.20260527023322-a78982bd81f2 + go.miloapis.com/service-catalog v0.0.0-20260529025310-809b6c6e4d91 + golang.org/x/crypto v0.49.0 + golang.org/x/sync v0.20.0 + golang.org/x/term v0.43.0 google.golang.org/protobuf v1.36.11 - k8s.io/api v0.33.1 - k8s.io/apimachinery v0.33.2 - k8s.io/client-go v0.33.1 - k8s.io/utils v0.0.0-20250604170112-4c0f3b243397 - sigs.k8s.io/controller-runtime v0.21.0 - sigs.k8s.io/gateway-api v1.2.1 - sigs.k8s.io/multicluster-runtime v0.21.0-alpha.8 + k8s.io/api v0.35.3 + k8s.io/apimachinery v0.35.3 + k8s.io/client-go v0.35.3 + k8s.io/component-base v0.35.3 + k8s.io/utils v0.0.0-20260319190234-28399d86e0b5 + sigs.k8s.io/controller-runtime v0.23.3 + 
sigs.k8s.io/gateway-api v1.3.1-0.20250527223622-54df0a899c1c + sigs.k8s.io/multicluster-runtime v0.23.3 + sigs.k8s.io/yaml v1.6.0 ) require ( - cel.dev/expr v0.19.1 // indirect - github.com/antlr4-go/antlr/v4 v4.13.0 // indirect + cel.dev/expr v0.25.1 // indirect + github.com/Masterminds/semver/v3 v3.4.0 // indirect + github.com/antlr4-go/antlr/v4 v4.13.1 // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/blang/semver/v4 v4.0.0 // indirect github.com/cenkalti/backoff/v4 v4.3.0 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect - github.com/emicklei/go-restful/v3 v3.12.2 // indirect + github.com/emicklei/go-restful/v3 v3.13.0 // indirect github.com/evanphx/json-patch/v5 v5.9.11 // indirect github.com/felixge/httpsnoop v1.0.4 // indirect github.com/fsnotify/fsnotify v1.9.0 // indirect - github.com/fxamacker/cbor/v2 v2.8.0 // indirect - github.com/go-logr/logr v1.4.3 // indirect + github.com/fxamacker/cbor/v2 v2.9.0 // indirect github.com/go-logr/stdr v1.2.2 // indirect github.com/go-logr/zapr v1.3.0 // indirect - github.com/go-openapi/jsonpointer v0.21.1 // indirect - github.com/go-openapi/jsonreference v0.21.0 // indirect - github.com/go-openapi/swag v0.23.1 // indirect + github.com/go-openapi/jsonpointer v0.22.4 // indirect + github.com/go-openapi/jsonreference v0.21.4 // indirect + github.com/go-openapi/swag v0.25.4 // indirect + github.com/go-openapi/swag/cmdutils v0.25.4 // indirect + github.com/go-openapi/swag/conv v0.25.4 // indirect + github.com/go-openapi/swag/fileutils v0.25.4 // indirect + github.com/go-openapi/swag/jsonname v0.25.4 // indirect + github.com/go-openapi/swag/jsonutils v0.25.4 // indirect + github.com/go-openapi/swag/loading v0.25.4 // indirect + github.com/go-openapi/swag/mangling v0.25.4 // indirect + github.com/go-openapi/swag/netutils v0.25.4 // indirect + github.com/go-openapi/swag/stringutils v0.25.4 // indirect + 
github.com/go-openapi/swag/typeutils v0.25.4 // indirect + github.com/go-openapi/swag/yamlutils v0.25.4 // indirect github.com/go-task/slim-sprig/v3 v3.0.0 // indirect - github.com/gogo/protobuf v1.3.2 // indirect github.com/google/btree v1.1.3 // indirect - github.com/google/cel-go v0.23.2 // indirect - github.com/google/gnostic-models v0.6.9 // indirect + github.com/google/cel-go v0.27.0 // indirect + github.com/google/gnostic-models v0.7.1 // indirect github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 // indirect github.com/google/uuid v1.6.0 // indirect - github.com/grpc-ecosystem/grpc-gateway/v2 v2.26.3 // indirect + github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.3 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect - github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect - github.com/mailru/easyjson v0.9.0 // indirect + github.com/klauspost/compress v1.18.3 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect - github.com/modern-go/reflect2 v1.0.2 // indirect + github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect - github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect - github.com/prometheus/client_golang v1.22.0 // indirect github.com/prometheus/client_model v0.6.2 // indirect - github.com/prometheus/common v0.64.0 // indirect - github.com/prometheus/procfs v0.16.1 // indirect - github.com/spf13/cobra v1.9.1 // indirect - github.com/spf13/pflag v1.0.7 // indirect - github.com/stoewer/go-strcase v1.3.0 // indirect + github.com/prometheus/common v0.66.1 // indirect + github.com/prometheus/procfs v0.17.0 // indirect + github.com/spf13/pflag v1.0.10 // indirect github.com/x448/float16 v0.8.4 // indirect - go.opentelemetry.io/auto/sdk v1.1.0 // indirect - 
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.58.0 // indirect - go.opentelemetry.io/otel v1.35.0 // indirect - go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.33.0 // indirect - go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.33.0 // indirect - go.opentelemetry.io/otel/metric v1.35.0 // indirect - go.opentelemetry.io/otel/sdk v1.34.0 // indirect - go.opentelemetry.io/otel/trace v1.35.0 // indirect - go.opentelemetry.io/proto/otlp v1.4.0 // indirect - go.uber.org/automaxprocs v1.6.0 // indirect + go.opentelemetry.io/auto/sdk v1.2.1 // indirect + go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.62.0 // indirect + go.opentelemetry.io/otel v1.40.0 // indirect + go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.37.0 // indirect + go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.35.0 // indirect + go.opentelemetry.io/otel/metric v1.40.0 // indirect + go.opentelemetry.io/otel/sdk v1.40.0 // indirect + go.opentelemetry.io/otel/trace v1.40.0 // indirect + go.opentelemetry.io/proto/otlp v1.9.0 // indirect go.uber.org/multierr v1.11.0 // indirect - go.uber.org/zap v1.27.0 // indirect - go.yaml.in/yaml/v2 v2.4.2 // indirect - golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 // indirect - golang.org/x/net v0.41.0 // indirect - golang.org/x/oauth2 v0.30.0 // indirect - golang.org/x/sys v0.33.0 // indirect - golang.org/x/term v0.32.0 // indirect - golang.org/x/text v0.26.0 // indirect - golang.org/x/time v0.12.0 // indirect - golang.org/x/tools v0.33.0 // indirect + go.uber.org/zap v1.27.1 // indirect + go.yaml.in/yaml/v2 v2.4.3 // indirect + go.yaml.in/yaml/v3 v3.0.4 // indirect + golang.org/x/exp v0.0.0-20250718183923-645b1fa84792 // indirect + golang.org/x/mod v0.35.0 // indirect + golang.org/x/net v0.52.0 // indirect + golang.org/x/oauth2 v0.36.0 // indirect + golang.org/x/sys v0.44.0 // indirect + golang.org/x/text v0.35.0 // indirect + golang.org/x/time v0.15.0 // indirect + 
golang.org/x/tools v0.43.0 // indirect gomodules.xyz/jsonpatch/v2 v2.5.0 // indirect - google.golang.org/genproto/googleapis/api v0.0.0-20250303144028-a0af3efb3deb // indirect - google.golang.org/genproto/googleapis/rpc v0.0.0-20250303144028-a0af3efb3deb // indirect - google.golang.org/grpc v1.71.1 // indirect - gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20260319201613-d00831a3d3e7 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20260311181403-84a4fc48630c // indirect + google.golang.org/grpc v1.79.3 // indirect + gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect - k8s.io/apiextensions-apiserver v0.33.1 // indirect - k8s.io/apiserver v0.33.1 // indirect - k8s.io/component-base v0.33.1 // indirect + k8s.io/apiextensions-apiserver v0.35.3 // indirect + k8s.io/apiserver v0.35.3 // indirect k8s.io/klog/v2 v2.130.1 // indirect - k8s.io/kube-openapi v0.0.0-20250610211856-8b98d1ed966a // indirect + k8s.io/kube-openapi v0.0.0-20260330154417-16be699c7b31 // indirect sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.2 // indirect - sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 // indirect + sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 // indirect sigs.k8s.io/randfill v1.0.0 // indirect - sigs.k8s.io/structured-merge-diff/v4 v4.7.0 // indirect - sigs.k8s.io/yaml v1.5.0 // indirect + sigs.k8s.io/structured-merge-diff/v6 v6.3.2 // indirect ) + +replace go.miloapis.com/milo => go.miloapis.com/milo v0.25.2-0.20260528192736-e4258524ad42 diff --git a/go.sum b/go.sum index c472bd8b..aa261aba 100644 --- a/go.sum +++ b/go.sum @@ -1,7 +1,9 @@ -cel.dev/expr v0.19.1 h1:NciYrtDRIR0lNCnH1LFJegdjspNx9fI59O7TWcua/W4= -cel.dev/expr v0.19.1/go.mod h1:MrpN08Q+lEBs+bGYdLxxHkZoUSsCp0nSKTs0nTymJgw= -github.com/antlr4-go/antlr/v4 v4.13.0 h1:lxCg3LAv+EUK6t1i0y1V6/SLeUi0eKEKdhQAlS8TVTI= -github.com/antlr4-go/antlr/v4 v4.13.0/go.mod 
h1:pfChB/xh/Unjila75QW7+VU4TSnWnnk9UTnmpPaOR2g= +cel.dev/expr v0.25.1 h1:1KrZg61W6TWSxuNZ37Xy49ps13NUovb66QLprthtwi4= +cel.dev/expr v0.25.1/go.mod h1:hrXvqGP6G6gyx8UAHSHJ5RGk//1Oj5nXQ2NI02Nrsg4= +github.com/Masterminds/semver/v3 v3.4.0 h1:Zog+i5UMtVoCU8oKka5P7i9q9HgrJeGzI9SA1Xbatp0= +github.com/Masterminds/semver/v3 v3.4.0/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lprf4nC11bSS5beM= +github.com/antlr4-go/antlr/v4 v4.13.1 h1:SqQKkuVZ+zWkMMNkjy5FZe5mr5WURWnlpmOuzYWrPrQ= +github.com/antlr4-go/antlr/v4 v4.13.1/go.mod h1:GKmUxMtwp6ZgGwZSva4eWPC5mS6vUAmOABFgjdkM7Nw= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM= @@ -15,18 +17,24 @@ github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSs github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/emicklei/go-restful/v3 v3.12.2 h1:DhwDP0vY3k8ZzE0RunuJy8GhNpPL6zqLkDf9B/a0/xU= -github.com/emicklei/go-restful/v3 v3.12.2/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= -github.com/evanphx/json-patch v5.7.0+incompatible h1:vgGkfT/9f8zE6tvSCe74nfpAVDQ2tG6yudJd8LBksgI= -github.com/evanphx/json-patch v5.7.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= +github.com/emicklei/go-restful/v3 v3.13.0 h1:C4Bl2xDndpU6nJ4bc1jXd+uTmYPVUwkD6bFY/oTyCes= +github.com/emicklei/go-restful/v3 v3.13.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= +github.com/evanphx/json-patch v5.9.11+incompatible h1:ixHHqfcGvxhWkniF1tWxBHA0yb4Z+d1UQi45df52xW8= +github.com/evanphx/json-patch v5.9.11+incompatible/go.mod 
h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= github.com/evanphx/json-patch/v5 v5.9.11 h1:/8HVnzMq13/3x9TPvjG08wUGqBTmZBsCWzjTM0wiaDU= github.com/evanphx/json-patch/v5 v5.9.11/go.mod h1:3j+LviiESTElxA4p3EMKAB9HXj3/XEtnUf6OZxqIQTM= github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k= github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0= -github.com/fxamacker/cbor/v2 v2.8.0 h1:fFtUGXUzXPHTIUdne5+zzMPTfffl3RD5qYnkY40vtxU= -github.com/fxamacker/cbor/v2 v2.8.0/go.mod h1:vM4b+DJCtHn+zz7h3FFp/hDAI9WNWCsZj23V5ytsSxQ= +github.com/fxamacker/cbor/v2 v2.9.0 h1:NpKPmjDBgUfBms6tr6JZkTHtfFGcMKsw3eGcmD/sapM= +github.com/fxamacker/cbor/v2 v2.9.0/go.mod h1:vM4b+DJCtHn+zz7h3FFp/hDAI9WNWCsZj23V5ytsSxQ= +github.com/gkampitakis/ciinfo v0.3.2 h1:JcuOPk8ZU7nZQjdUhctuhQofk7BGHuIy0c9Ez8BNhXs= +github.com/gkampitakis/ciinfo v0.3.2/go.mod h1:1NIwaOcFChN4fa/B0hEBdAb6npDlFL8Bwx4dfRLRqAo= +github.com/gkampitakis/go-diff v1.3.2 h1:Qyn0J9XJSDTgnsgHRdz9Zp24RaJeKMUHg2+PDZZdC4M= +github.com/gkampitakis/go-diff v1.3.2/go.mod h1:LLgOrpqleQe26cte8s36HTWcTmMEur6OPYerdAAS9tk= +github.com/gkampitakis/go-snaps v0.5.15 h1:amyJrvM1D33cPHwVrjo9jQxX8g/7E2wYdZ+01KS3zGE= +github.com/gkampitakis/go-snaps v0.5.15/go.mod h1:HNpx/9GoKisdhw9AFOBT1N7DBs9DiHo/hGheFGBZ+mc= github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= @@ -34,25 +42,52 @@ github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= github.com/go-logr/zapr v1.3.0 h1:XGdV8XW8zdwFiwOA2Dryh1gj2KRQyOOoNmBy4EplIcQ= 
github.com/go-logr/zapr v1.3.0/go.mod h1:YKepepNBd1u/oyhd/yQmtjVXmm9uML4IXUgMOwR8/Gg= -github.com/go-openapi/jsonpointer v0.21.1 h1:whnzv/pNXtK2FbX/W9yJfRmE2gsmkfahjMKB0fZvcic= -github.com/go-openapi/jsonpointer v0.21.1/go.mod h1:50I1STOfbY1ycR8jGz8DaMeLCdXiI6aDteEdRNNzpdk= -github.com/go-openapi/jsonreference v0.21.0 h1:Rs+Y7hSXT83Jacb7kFyjn4ijOuVGSvOdF2+tg1TRrwQ= -github.com/go-openapi/jsonreference v0.21.0/go.mod h1:LmZmgsrTkVg9LG4EaHeY8cBDslNPMo06cago5JNLkm4= -github.com/go-openapi/swag v0.23.1 h1:lpsStH0n2ittzTnbaSloVZLuB5+fvSY/+hnagBjSNZU= -github.com/go-openapi/swag v0.23.1/go.mod h1:STZs8TbRvEQQKUA+JZNAm3EWlgaOBGpyFDqQnDHMef0= +github.com/go-openapi/jsonpointer v0.22.4 h1:dZtK82WlNpVLDW2jlA1YCiVJFVqkED1MegOUy9kR5T4= +github.com/go-openapi/jsonpointer v0.22.4/go.mod h1:elX9+UgznpFhgBuaMQ7iu4lvvX1nvNsesQ3oxmYTw80= +github.com/go-openapi/jsonreference v0.21.4 h1:24qaE2y9bx/q3uRK/qN+TDwbok1NhbSmGjjySRCHtC8= +github.com/go-openapi/jsonreference v0.21.4/go.mod h1:rIENPTjDbLpzQmQWCj5kKj3ZlmEh+EFVbz3RTUh30/4= +github.com/go-openapi/swag v0.25.4 h1:OyUPUFYDPDBMkqyxOTkqDYFnrhuhi9NR6QVUvIochMU= +github.com/go-openapi/swag v0.25.4/go.mod h1:zNfJ9WZABGHCFg2RnY0S4IOkAcVTzJ6z2Bi+Q4i6qFQ= +github.com/go-openapi/swag/cmdutils v0.25.4 h1:8rYhB5n6WawR192/BfUu2iVlxqVR9aRgGJP6WaBoW+4= +github.com/go-openapi/swag/cmdutils v0.25.4/go.mod h1:pdae/AFo6WxLl5L0rq87eRzVPm/XRHM3MoYgRMvG4A0= +github.com/go-openapi/swag/conv v0.25.4 h1:/Dd7p0LZXczgUcC/Ikm1+YqVzkEeCc9LnOWjfkpkfe4= +github.com/go-openapi/swag/conv v0.25.4/go.mod h1:3LXfie/lwoAv0NHoEuY1hjoFAYkvlqI/Bn5EQDD3PPU= +github.com/go-openapi/swag/fileutils v0.25.4 h1:2oI0XNW5y6UWZTC7vAxC8hmsK/tOkWXHJQH4lKjqw+Y= +github.com/go-openapi/swag/fileutils v0.25.4/go.mod h1:cdOT/PKbwcysVQ9Tpr0q20lQKH7MGhOEb6EwmHOirUk= +github.com/go-openapi/swag/jsonname v0.25.4 h1:bZH0+MsS03MbnwBXYhuTttMOqk+5KcQ9869Vye1bNHI= +github.com/go-openapi/swag/jsonname v0.25.4/go.mod h1:GPVEk9CWVhNvWhZgrnvRA6utbAltopbKwDu8mXNUMag= 
+github.com/go-openapi/swag/jsonutils v0.25.4 h1:VSchfbGhD4UTf4vCdR2F4TLBdLwHyUDTd1/q4i+jGZA= +github.com/go-openapi/swag/jsonutils v0.25.4/go.mod h1:7OYGXpvVFPn4PpaSdPHJBtF0iGnbEaTk8AvBkoWnaAY= +github.com/go-openapi/swag/jsonutils/fixtures_test v0.25.4 h1:IACsSvBhiNJwlDix7wq39SS2Fh7lUOCJRmx/4SN4sVo= +github.com/go-openapi/swag/jsonutils/fixtures_test v0.25.4/go.mod h1:Mt0Ost9l3cUzVv4OEZG+WSeoHwjWLnarzMePNDAOBiM= +github.com/go-openapi/swag/loading v0.25.4 h1:jN4MvLj0X6yhCDduRsxDDw1aHe+ZWoLjW+9ZQWIKn2s= +github.com/go-openapi/swag/loading v0.25.4/go.mod h1:rpUM1ZiyEP9+mNLIQUdMiD7dCETXvkkC30z53i+ftTE= +github.com/go-openapi/swag/mangling v0.25.4 h1:2b9kBJk9JvPgxr36V23FxJLdwBrpijI26Bx5JH4Hp48= +github.com/go-openapi/swag/mangling v0.25.4/go.mod h1:6dxwu6QyORHpIIApsdZgb6wBk/DPU15MdyYj/ikn0Hg= +github.com/go-openapi/swag/netutils v0.25.4 h1:Gqe6K71bGRb3ZQLusdI8p/y1KLgV4M/k+/HzVSqT8H0= +github.com/go-openapi/swag/netutils v0.25.4/go.mod h1:m2W8dtdaoX7oj9rEttLyTeEFFEBvnAx9qHd5nJEBzYg= +github.com/go-openapi/swag/stringutils v0.25.4 h1:O6dU1Rd8bej4HPA3/CLPciNBBDwZj9HiEpdVsb8B5A8= +github.com/go-openapi/swag/stringutils v0.25.4/go.mod h1:GTsRvhJW5xM5gkgiFe0fV3PUlFm0dr8vki6/VSRaZK0= +github.com/go-openapi/swag/typeutils v0.25.4 h1:1/fbZOUN472NTc39zpa+YGHn3jzHWhv42wAJSN91wRw= +github.com/go-openapi/swag/typeutils v0.25.4/go.mod h1:Ou7g//Wx8tTLS9vG0UmzfCsjZjKhpjxayRKTHXf2pTE= +github.com/go-openapi/swag/yamlutils v0.25.4 h1:6jdaeSItEUb7ioS9lFoCZ65Cne1/RZtPBZ9A56h92Sw= +github.com/go-openapi/swag/yamlutils v0.25.4/go.mod h1:MNzq1ulQu+yd8Kl7wPOut/YHAAU/H6hL91fF+E2RFwc= +github.com/go-openapi/testify/enable/yaml/v2 v2.0.2 h1:0+Y41Pz1NkbTHz8NngxTuAXxEodtNSI1WG1c/m5Akw4= +github.com/go-openapi/testify/enable/yaml/v2 v2.0.2/go.mod h1:kme83333GCtJQHXQ8UKX3IBZu6z8T5Dvy5+CW3NLUUg= +github.com/go-openapi/testify/v2 v2.0.2 h1:X999g3jeLcoY8qctY/c/Z8iBHTbwLz7R2WXd6Ub6wls= +github.com/go-openapi/testify/v2 v2.0.2/go.mod h1:HCPmvFFnheKK2BuwSA0TbbdxJ3I16pjwMkYkP4Ywn54= 
github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI= github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8= -github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= -github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= +github.com/goccy/go-yaml v1.18.0 h1:8W7wMFS12Pcas7KU+VVkaiCng+kG8QiFeFwzFb+rwuw= +github.com/goccy/go-yaml v1.18.0/go.mod h1:XBurs7gK8ATbW4ZPGKgcbrY1Br56PdM69F7LkFRi1kA= github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= github.com/google/btree v1.1.3 h1:CVpQJjYgC4VbzxeGVHfvZrv1ctoYCAI8vbl07Fcxlyg= github.com/google/btree v1.1.3/go.mod h1:qOPhT0dTNdNzV6Z/lhRX0YXUafgPLFUh+gZMl761Gm4= -github.com/google/cel-go v0.23.2 h1:UdEe3CvQh3Nv+E/j9r1Y//WO0K0cSyD7/y0bzyLIMI4= -github.com/google/cel-go v0.23.2/go.mod h1:52Pb6QsDbC5kvgxvZhiL9QX1oZEkcUF/ZqaPx1J5Wwo= -github.com/google/gnostic-models v0.6.9 h1:MU/8wDLif2qCXZmzncUQ/BOfxWfthHi63KqpoNbWqVw= -github.com/google/gnostic-models v0.6.9/go.mod h1:CiWsm0s6BSQd1hRn8/QmxqB6BesYcbSZxsz9b0KuDBw= -github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/cel-go v0.27.0 h1:e7ih85+4qVrBuqQWTW4FKSqZYokVuc3HnhH5keboFTo= +github.com/google/cel-go v0.27.0/go.mod h1:tTJ11FWqnhw5KKpnWpvW9CJC3Y9GK4EIS0WXnBbebzw= +github.com/google/gnostic-models v0.7.1 h1:SisTfuFKJSKM5CPZkffwi6coztzzeYUhc3v4yxLWH8c= +github.com/google/gnostic-models v0.7.1/go.mod h1:whL5G0m6dmc5cPxKc5bdKdEN3UjI7OUGxBlw57miDrQ= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= @@ -62,227 +97,187 @@ github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 
h1:BHT72Gu3keYf3ZEu2J github.com/google/pprof v0.0.0-20250403155104-27863c87afa6/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= -github.com/grpc-ecosystem/grpc-gateway/v2 v2.24.0 h1:TmHmbvxPmaegwhDubVz0lICL0J5Ka2vwTzhoePEXsGE= -github.com/grpc-ecosystem/grpc-gateway/v2 v2.24.0/go.mod h1:qztMSjm835F2bXf+5HKAPIS5qsmQDqZna/PgVt4rWtI= -github.com/grpc-ecosystem/grpc-gateway/v2 v2.26.3 h1:5ZPtiqj0JL5oKWmcsq4VMaAW5ukBEgSGXEN89zeH1Jo= -github.com/grpc-ecosystem/grpc-gateway/v2 v2.26.3/go.mod h1:ndYquD05frm2vACXE1nsccT4oJzjhw2arTS2cpUD1PI= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.3 h1:NmZ1PKzSTQbuGHw9DGPFomqkkLWMC+vZCkfs+FHv1Vg= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.3/go.mod h1:zQrxl1YP88HQlA6i9c63DSVPFklWpGX4OWAc9bFuaH4= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= -github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= -github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= +github.com/joshdk/go-junit v1.0.0 h1:S86cUKIdwBHWwA6xCmFlf3RTLfVXYQfvanM5Uh+K6GE= +github.com/joshdk/go-junit v1.0.0/go.mod h1:TiiV0PqkaNfFXjEiyjWM3XXrhVyCa1K4Zfga6W52ung= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= -github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= -github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= -github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= -github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ= 
+github.com/karmada-io/api v1.17.0 h1:pBA4r6MwKoHxv0ZaE8R0XM53xULkFLX6rKiQeD7avh0= +github.com/karmada-io/api v1.17.0/go.mod h1:gXdGOj7A7R+vcoHFFmHcMN/BomXfFCJCpqNFMFj11d0= +github.com/klauspost/compress v1.18.3 h1:9PJRvfbmTabkOX8moIpXPbMMbYN60bWImDDU7L+/6zw= +github.com/klauspost/compress v1.18.3/go.mod h1:R0h/fSBs8DE4ENlcrlib3PsXS61voFxhIs2DeRhCvJ4= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= -github.com/mailru/easyjson v0.9.0 h1:PrnmzHw7262yW8sTBwxi1PdJA3Iw/EKBa8psRf7d9a4= -github.com/mailru/easyjson v0.9.0/go.mod h1:1+xMtQp2MRNVL/V1bOzuP3aP8VNwRW55fQUto+XFtTU= +github.com/maruel/natural v1.1.1 h1:Hja7XhhmvEFhcByqDoHz9QZbkWey+COd9xWfCfn1ioo= +github.com/maruel/natural v1.1.1/go.mod h1:v+Rfd79xlw1AgVBjbO0BEQmptqb5HvL/k9GRHB7ZKEg= +github.com/mfridman/tparse v0.18.0 h1:wh6dzOKaIwkUGyKgOntDW4liXSo37qg5AXbIhkMV3vE= +github.com/mfridman/tparse v0.18.0/go.mod h1:gEvqZTuCgEhPbYk/2lS3Kcxg1GmTxxU7kTC8DvP0i/A= github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= -github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= +github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee 
h1:W5t00kpgFdJifH4BDsTlE89Zl93FEloxaWZfGcifgq8= +github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= -github.com/onsi/ginkgo/v2 v2.23.4 h1:ktYTpKJAVZnDT4VjxSbiBenUjmlL/5QkBEocaWXiQus= -github.com/onsi/ginkgo/v2 v2.23.4/go.mod h1:Bt66ApGPBFzHyR+JO10Zbt0Gsp4uWxu5mIOTusL46e8= -github.com/onsi/gomega v1.37.0 h1:CdEG8g0S133B4OswTDC/5XPSzE1OeP29QOioj2PID2Y= -github.com/onsi/gomega v1.37.0/go.mod h1:8D9+Txp43QWKhM24yyOBEdpkzN8FvJyAwecBgsU4KU0= -github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= -github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/onsi/ginkgo/v2 v2.27.2 h1:LzwLj0b89qtIy6SSASkzlNvX6WktqurSHwkk2ipF/Ns= +github.com/onsi/ginkgo/v2 v2.27.2/go.mod h1:ArE1D/XhNXBXCBkKOLkbsb2c81dQHCRcF5zwn/ykDRo= +github.com/onsi/gomega v1.38.2 h1:eZCjf2xjZAqe+LeWvKb5weQ+NcPwX84kqJ0cZNxok2A= +github.com/onsi/gomega v1.38.2/go.mod h1:W2MJcYxRGV63b418Ai34Ud0hEdTVXq9NW9+Sx6uXf3k= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/prashantv/gostub v1.1.0 h1:BTyx3RfQjRHnUWaGF9oQos79AlQ5k8WNktv7VGvVH4g= -github.com/prashantv/gostub v1.1.0/go.mod h1:A5zLQHz7ieHGG7is6LLXLz7I8+3LZzsrV0P1IAHhP5U= -github.com/prometheus/client_golang v1.22.0 h1:rb93p9lokFEsctTys46VnV1kLCDpVZ0a/Y92Vm0Zc6Q= -github.com/prometheus/client_golang v1.22.0/go.mod h1:R7ljNsLXhuQXYZYtw6GAE9AZg8Y7vEW5scdCXrWRXC0= +github.com/prometheus/client_golang v1.23.2 
h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o= +github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg= github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= -github.com/prometheus/common v0.64.0 h1:pdZeA+g617P7oGv1CzdTzyeShxAGrTBsolKNOLQPGO4= -github.com/prometheus/common v0.64.0/go.mod h1:0gZns+BLRQ3V6NdaerOhMbwwRbNh9hkGINtQAsP5GS8= -github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyRg= -github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is= -github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII= -github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o= +github.com/prometheus/common v0.66.1 h1:h5E0h5/Y8niHc5DlaLlWLArTQI7tMrsfQjHV+d9ZoGs= +github.com/prometheus/common v0.66.1/go.mod h1:gcaUsgf3KfRSwHY4dIMXLPV0K/Wg1oZ8+SbZk/HH/dA= +github.com/prometheus/procfs v0.17.0 h1:FuLQ+05u4ZI+SS/w9+BWEM2TXiHKsUQ9TADiRH7DuK0= +github.com/prometheus/procfs v0.17.0/go.mod h1:oPQLaDAMRbA+u8H5Pbfq+dl3VDAvHxMUOVhe0wYB2zw= +github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ= +github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= -github.com/spf13/cobra v1.9.1 h1:CXSaggrXdbHK9CF+8ywj8Amf7PBRmPCOJugH954Nnlo= -github.com/spf13/cobra v1.9.1/go.mod h1:nDyEzZ8ogv936Cinf6g1RU9MRY64Ir93oCnqb9wxYW0= -github.com/spf13/pflag v1.0.6/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= -github.com/spf13/pflag v1.0.7 h1:vN6T9TfwStFPFM5XzjsvmzZkLuaLX+HS+0SeFLRgU6M= -github.com/spf13/pflag v1.0.7/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= -github.com/stoewer/go-strcase v1.3.0 
h1:g0eASXYtp+yvN9fK8sH94oCIk0fau9uV1/ZdJ0AVEzs= -github.com/stoewer/go-strcase v1.3.0/go.mod h1:fAH5hQ5pehh+j3nZfvwdk2RgEgQjAoM8wodgtPmh1xo= +github.com/spf13/cobra v1.10.2 h1:DMTTonx5m65Ic0GOoRY2c16WCbHxOOw6xxezuLaBpcU= +github.com/spf13/cobra v1.10.2/go.mod h1:7C1pvHqHw5A4vrJfjNwvOdzYu0Gml16OCs2GRiTUUS4= +github.com/spf13/pflag v1.0.9/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/spf13/pflag v1.0.10 h1:4EBh2KAYBwaONj6b2Ye1GiHfwjqyROoF4RwYO+vPwFk= +github.com/spf13/pflag v1.0.10/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= -github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= -github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY= github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= -github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= -github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= -github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= -github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= -github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= +github.com/tidwall/gjson v1.18.0 h1:FIDeeyB800efLX89e5a8Y0BNH+LOngJyGrIWxG2FKQY= +github.com/tidwall/gjson v1.18.0/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= +github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA= +github.com/tidwall/match v1.1.1/go.mod 
h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM= +github.com/tidwall/pretty v1.2.1 h1:qjsOFOWWQl+N3RsoF5/ssm1pHmJJwhjlSbZ51I6wMl4= +github.com/tidwall/pretty v1.2.1/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU= +github.com/tidwall/sjson v1.2.5 h1:kLy8mja+1c9jlljvWTlSazM7cKDRfJuR/bOJhcY5NcY= +github.com/tidwall/sjson v1.2.5/go.mod h1:Fvgq9kS/6ociJEDnK0Fk1cpYF4FIW6ZF7LAe+6jwd28= github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg= -github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= -github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= -go.datum.net/network-services-operator v0.1.0 h1:PAXOZ5DdJFgRoeVBPIXhqkCm6DxbP4tVOPcr3Y7h/So= -go.datum.net/network-services-operator v0.1.0/go.mod h1:uloVfxqE+8DgSiMB651X8UC9yECpXbwp/NBstofCceE= -go.miloapis.com/milo v0.1.0 h1:AYFVz1lfta/NbWSFSSKPtnkCA2rN+iegxlfQrDgEvYY= -go.miloapis.com/milo v0.1.0/go.mod h1:X+DpWOchv/Vm63mwHnboW00KRGsODY2bUTS/bBbK1+E= -go.miloapis.com/milo v0.24.11 h1:rByXDKbP4ZEN0I/z1C2RyUCyQi0NWrITLqoQILSAn2E= -go.miloapis.com/milo v0.24.11/go.mod h1:xOFYvUsvSZV3z6eow5YdB5C/qRQf2s/5/arcfJs5XPg= -go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA= -go.opentelemetry.io/auto/sdk v1.1.0/go.mod h1:3wSPjt5PWp2RhlCcmmOial7AvC4DQqZb7a7wCow3W8A= -go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.58.0 h1:yd02MEjBdJkG3uabWP9apV+OuWRIXGDuJEUJbOHmCFU= -go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.58.0/go.mod h1:umTcuxiv1n/s/S6/c2AT/g2CQ7u5C59sHDNmfSwgz7Q= -go.opentelemetry.io/otel v1.35.0 h1:xKWKPxrxB6OtMCbmMY021CqC45J+3Onta9MqjhnusiQ= -go.opentelemetry.io/otel v1.35.0/go.mod h1:UEqy8Zp11hpkUrL73gSlELM0DupHoiq72dR+Zqel/+Y= -go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.33.0 h1:Vh5HayB/0HHfOQA7Ctx69E/Y/DcQSMPpKANYVMQ7fBA= 
-go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.33.0/go.mod h1:cpgtDBaqD/6ok/UG0jT15/uKjAY8mRA53diogHBg3UI= -go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.33.0 h1:5pojmb1U1AogINhN3SurB+zm/nIcusopeBNp42f45QM= -go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.33.0/go.mod h1:57gTHJSE5S1tqg+EKsLPlTWhpHMsWlVmer+LA926XiA= -go.opentelemetry.io/otel/metric v1.35.0 h1:0znxYu2SNyuMSQT4Y9WDWej0VpcsxkuklLa4/siN90M= -go.opentelemetry.io/otel/metric v1.35.0/go.mod h1:nKVFgxBZ2fReX6IlyW28MgZojkoAkJGaE8CpgeAU3oE= -go.opentelemetry.io/otel/sdk v1.34.0 h1:95zS4k/2GOy069d321O8jWgYsW3MzVV+KuSPKp7Wr1A= -go.opentelemetry.io/otel/sdk v1.34.0/go.mod h1:0e/pNiaMAqaykJGKbi+tSjWfNNHMTxoC9qANsCzbyxU= -go.opentelemetry.io/otel/sdk/metric v1.34.0 h1:5CeK9ujjbFVL5c1PhLuStg1wxA7vQv7ce1EK0Gyvahk= -go.opentelemetry.io/otel/sdk/metric v1.34.0/go.mod h1:jQ/r8Ze28zRKoNRdkjCZxfs6YvBTG1+YIqyFVFYec5w= -go.opentelemetry.io/otel/trace v1.35.0 h1:dPpEfJu1sDIqruz7BHFG3c7528f6ddfSWfFDVt/xgMs= -go.opentelemetry.io/otel/trace v1.35.0/go.mod h1:WUk7DtFp1Aw2MkvqGdwiXYDZZNvA/1J8o6xRXLrIkyc= -go.opentelemetry.io/proto/otlp v1.4.0 h1:TA9WRvW6zMwP+Ssb6fLoUIuirti1gGbP28GcKG1jgeg= -go.opentelemetry.io/proto/otlp v1.4.0/go.mod h1:PPBWZIP98o2ElSqI35IHfu7hIhSwvc5N38Jw8pXuGFY= -go.uber.org/automaxprocs v1.6.0 h1:O3y2/QNTOdbF+e/dpXNNW7Rx2hZ4sTIPyybbxyNqTUs= -go.uber.org/automaxprocs v1.6.0/go.mod h1:ifeIMSnPZuznNm6jmdzmU3/bfk01Fe2fotchwEFJ8r8= +go.datum.net/datumctl v0.14.1-0.20260523153711-b44de1c715c1 h1:C+VX+/mGJDZjjohBbdJ/PL0qRWqArpxk+wzGe2KpEC8= +go.datum.net/datumctl v0.14.1-0.20260523153711-b44de1c715c1/go.mod h1:rwu8XWb0FeMzX8vCu+UxKLw89DAkyLOh70PNbDaotac= +go.datum.net/network-services-operator v0.21.10-0.20260528021428-b0f2347f5359 h1:P3dePA6cCXKimZzE6d7Xxpj2rz54BxOHI8K8ic7VQ+c= +go.datum.net/network-services-operator v0.21.10-0.20260528021428-b0f2347f5359/go.mod h1:Nr0PsCodkTW31vWVxR9dhAP9w0y+WHUYeyrcRnchcIE= +go.miloapis.com/milo v0.25.2-0.20260528192736-e4258524ad42 
h1:LSHyqLt/jus6iEMvo8pc731L+PyrTHP2bqfMMtHPSWc= +go.miloapis.com/milo v0.25.2-0.20260528192736-e4258524ad42/go.mod h1:p9O2kk194mvoL8rhqjwb+LWB+GIyY4vQqiTowwibVWo= +go.miloapis.com/service-catalog v0.0.0-20260529025310-809b6c6e4d91 h1:fEvsK12btRZOwjvd5Ps7syNwrwo7EknKgq+hVyK/6ck= +go.miloapis.com/service-catalog v0.0.0-20260529025310-809b6c6e4d91/go.mod h1:znOMOYlmNfQmIvS/7ZpaI909DtLfKEvFe5QK9CgO8GE= +go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64= +go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.62.0 h1:Hf9xI/XLML9ElpiHVDNwvqI0hIFlzV8dgIr35kV1kRU= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.62.0/go.mod h1:NfchwuyNoMcZ5MLHwPrODwUF1HWCXWrL31s8gSAdIKY= +go.opentelemetry.io/otel v1.40.0 h1:oA5YeOcpRTXq6NN7frwmwFR0Cn3RhTVZvXsP4duvCms= +go.opentelemetry.io/otel v1.40.0/go.mod h1:IMb+uXZUKkMXdPddhwAHm6UfOwJyh4ct1ybIlV14J0g= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.37.0 h1:Ahq7pZmv87yiyn3jeFz/LekZmPLLdKejuO3NcK9MssM= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.37.0/go.mod h1:MJTqhM0im3mRLw1i8uGHnCvUEeS7VwRyxlLC78PA18M= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.35.0 h1:m639+BofXTvcY1q8CGs4ItwQarYtJPOWmVobfM1HpVI= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.35.0/go.mod h1:LjReUci/F4BUyv+y4dwnq3h/26iNOeC3wAIqgvTIZVo= +go.opentelemetry.io/otel/metric v1.40.0 h1:rcZe317KPftE2rstWIBitCdVp89A2HqjkxR3c11+p9g= +go.opentelemetry.io/otel/metric v1.40.0/go.mod h1:ib/crwQH7N3r5kfiBZQbwrTge743UDc7DTFVZrrXnqc= +go.opentelemetry.io/otel/sdk v1.40.0 h1:KHW/jUzgo6wsPh9At46+h4upjtccTmuZCFAc9OJ71f8= +go.opentelemetry.io/otel/sdk v1.40.0/go.mod h1:Ph7EFdYvxq72Y8Li9q8KebuYUr2KoeyHx0DRMKrYBUE= +go.opentelemetry.io/otel/sdk/metric v1.40.0 h1:mtmdVqgQkeRxHgRv4qhyJduP3fYJRMX4AtAlbuWdCYw= +go.opentelemetry.io/otel/sdk/metric v1.40.0/go.mod 
h1:4Z2bGMf0KSK3uRjlczMOeMhKU2rhUqdWNoKcYrtcBPg= +go.opentelemetry.io/otel/trace v1.40.0 h1:WA4etStDttCSYuhwvEa8OP8I5EWu24lkOzp+ZYblVjw= +go.opentelemetry.io/otel/trace v1.40.0/go.mod h1:zeAhriXecNGP/s2SEG3+Y8X9ujcJOTqQ5RgdEJcawiA= +go.opentelemetry.io/proto/otlp v1.9.0 h1:l706jCMITVouPOqEnii2fIAuO3IVGBRPV5ICjceRb/A= +go.opentelemetry.io/proto/otlp v1.9.0/go.mod h1:xE+Cx5E/eEHw+ISFkwPLwCZefwVjY+pqKg1qcK03+/4= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y= -go.uber.org/zap v1.27.0 h1:aJMhYGrd5QSmlpLMr2MftRKl7t8J8PTZPA732ud/XR8= -go.uber.org/zap v1.27.0/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E= -go.yaml.in/yaml/v2 v2.4.2 h1:DzmwEr2rDGHl7lsFgAHxmNz/1NlQ7xLIrlN2h5d1eGI= -go.yaml.in/yaml/v2 v2.4.2/go.mod h1:081UH+NErpNdqlCXm3TtEran0rJZGxAYx9hb/ELlsPU= -go.yaml.in/yaml/v3 v3.0.3 h1:bXOww4E/J3f66rav3pX3m8w6jDE4knZjGOw8b5Y6iNE= -go.yaml.in/yaml/v3 v3.0.3/go.mod h1:tBHosrYAkRZjRAOREWbDnBXUf08JOwYq++0QNwQiWzI= -golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= -golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= -golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= -golang.org/x/crypto v0.39.0 h1:SHs+kF4LP+f+p14esP5jAoDpHU8Gu/v9lFRK6IT5imM= -golang.org/x/crypto v0.39.0/go.mod h1:L+Xg3Wf6HoL4Bn4238Z6ft6KfEpN0tJGo53AAPC632U= -golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 h1:2dVuKD2vS7b0QIHQbpyTISPd0LeHDbnYEryqj5Q1ug8= -golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56/go.mod h1:M4RDyNAINzryxdtnbRXRL/OHtkFuWGRjvuhBJpk2IlY= -golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= -golang.org/x/mod 
v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= -golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= -golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= -golang.org/x/net v0.41.0 h1:vBTly1HeNPEn3wtREYfy4GZ/NECgw2Cnl+nK6Nz3uvw= -golang.org/x/net v0.41.0/go.mod h1:B/K4NNqkfmg07DQYrbwvSluqCJOOXwUjeb/5lOisjbA= -golang.org/x/oauth2 v0.30.0 h1:dnDm7JmhM45NNpd8FDDeLhK6FwqbOf4MLCM9zb1BOHI= -golang.org/x/oauth2 v0.30.0/go.mod h1:B++QgG3ZKulg6sRPGD/mqlHQs5rB3Ml9erfeDY7xKlU= -golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.15.0 h1:KWH3jNZsfyT6xfAfKiz6MRNmd46ByHDYaZ7KSkCtdW8= -golang.org/x/sync v0.15.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= -golang.org/x/sync v0.16.0 h1:ycBJEhp9p4vXvUZNszeOq0kGTPghopOL8q0fq3vstxw= -golang.org/x/sync v0.16.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= -golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= -golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.33.0 h1:q3i8TbbEz+JRD9ywIRlyRAQbM0qF7hu24q3teo2hbuw= -golang.org/x/sys v0.33.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= -golang.org/x/term v0.32.0 
h1:DR4lr0TjUs3epypdhTOkMmuF5CDFJ/8pOnbzMZPQ7bg= -golang.org/x/term v0.32.0/go.mod h1:uZG1FhGx848Sqfsq4/DlJr3xGGsYMu/L5GW4abiaEPQ= -golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= -golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.26.0 h1:P42AVeLghgTYr4+xUnTRKDMqpar+PtX7KWuNQL21L8M= -golang.org/x/text v0.26.0/go.mod h1:QK15LZJUUQVJxhz7wXgxSy/CJaTFjd0G+YLonydOVQA= -golang.org/x/time v0.12.0 h1:ScB/8o8olJvc+CQPWrK3fPZNfh7qgwCrY0zJmoEQLSE= -golang.org/x/time v0.12.0/go.mod h1:CDIdPxbZBQxdj6cxyCIdrNogrJKMJ7pr37NYpMcMDSg= -golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= -golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= -golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= -golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= -golang.org/x/tools v0.33.0 h1:4qz2S3zmRxbGIhDIAgjxvFutSvH5EfnsYrRBj0UI0bc= -golang.org/x/tools v0.33.0/go.mod h1:CIJMaWEY88juyUfo7UbgPqbC8rU2OqfAV1h2Qp0oMYI= -golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +go.uber.org/zap v1.27.1 h1:08RqriUEv8+ArZRYSTXy1LeBScaMpVSTBhCeaZYfMYc= +go.uber.org/zap v1.27.1/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E= +go.yaml.in/yaml/v2 v2.4.3 h1:6gvOSjQoTB3vt1l+CU+tSyi/HOjfOjRLJ4YwYZGwRO0= +go.yaml.in/yaml/v2 v2.4.3/go.mod h1:zSxWcmIDjOzPXpjlTTbAsKokqkDNAVtZO0WOMiT90s8= +go.yaml.in/yaml/v3 v3.0.4 
h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= +go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= +golang.org/x/crypto v0.49.0 h1:+Ng2ULVvLHnJ/ZFEq4KdcDd/cfjrrjjNSXNzxg0Y4U4= +golang.org/x/crypto v0.49.0/go.mod h1:ErX4dUh2UM+CFYiXZRTcMpEcN8b/1gxEuv3nODoYtCA= +golang.org/x/exp v0.0.0-20250718183923-645b1fa84792 h1:R9PFI6EUdfVKgwKjZef7QIwGcBKu86OEFpJ9nUEP2l4= +golang.org/x/exp v0.0.0-20250718183923-645b1fa84792/go.mod h1:A+z0yzpGtvnG90cToK5n2tu8UJVP2XUATh+r+sfOOOc= +golang.org/x/mod v0.35.0 h1:Ww1D637e6Pg+Zb2KrWfHQUnH2dQRLBQyAtpr/haaJeM= +golang.org/x/mod v0.35.0/go.mod h1:+GwiRhIInF8wPm+4AoT6L0FA1QWAad3OMdTRx4tFYlU= +golang.org/x/net v0.52.0 h1:He/TN1l0e4mmR3QqHMT2Xab3Aj3L9qjbhRm78/6jrW0= +golang.org/x/net v0.52.0/go.mod h1:R1MAz7uMZxVMualyPXb+VaqGSa3LIaUqk0eEt3w36Sw= +golang.org/x/oauth2 v0.36.0 h1:peZ/1z27fi9hUOFCAZaHyrpWG5lwe0RJEEEeH0ThlIs= +golang.org/x/oauth2 v0.36.0/go.mod h1:YDBUJMTkDnJS+A4BP4eZBjCqtokkg1hODuPjwiGPO7Q= +golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4= +golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0= +golang.org/x/sys v0.44.0 h1:ildZl3J4uzeKP07r2F++Op7E9B29JRUy+a27EibtBTQ= +golang.org/x/sys v0.44.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= +golang.org/x/term v0.43.0 h1:S4RLU2sB31O/NCl+zFN9Aru9A/Cq2aqKpTZJ6B+DwT4= +golang.org/x/term v0.43.0/go.mod h1:lrhlHNdQJHO+1qVYiHfFKVuVioJIheAc3fBSMFYEIsk= +golang.org/x/text v0.35.0 h1:JOVx6vVDFokkpaq1AEptVzLTpDe9KGpj5tR4/X+ybL8= +golang.org/x/text v0.35.0/go.mod h1:khi/HExzZJ2pGnjenulevKNX1W67CUy0AsXcNubPGCA= +golang.org/x/time v0.15.0 h1:bbrp8t3bGUeFOx08pvsMYRTCVSMk89u4tKbNOZbp88U= +golang.org/x/time v0.15.0/go.mod h1:Y4YMaQmXwGQZoFaVFk4YpCt4FLQMYKZe9oeV/f4MSno= +golang.org/x/tools v0.43.0 h1:12BdW9CeB3Z+J/I/wj34VMl8X+fEXBxVR90JeMX5E7s= +golang.org/x/tools v0.43.0/go.mod h1:uHkMso649BX2cZK6+RpuIPXS3ho2hZo4FVwfoy1vIk0= gomodules.xyz/jsonpatch/v2 v2.5.0 h1:JELs8RLM12qJGXU4u/TO3V25KW8GreMKl9pdkk14RM0= 
gomodules.xyz/jsonpatch/v2 v2.5.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY= -google.golang.org/genproto/googleapis/api v0.0.0-20250106144421-5f5ef82da422 h1:GVIKPyP/kLIyVOgOnTwFOrvQaQUzOzGMCxgFUOEmm24= -google.golang.org/genproto/googleapis/api v0.0.0-20250106144421-5f5ef82da422/go.mod h1:b6h1vNKhxaSoEI+5jc3PJUCustfli/mRab7295pY7rw= -google.golang.org/genproto/googleapis/api v0.0.0-20250303144028-a0af3efb3deb h1:p31xT4yrYrSM/G4Sn2+TNUkVhFCbG9y8itM2S6Th950= -google.golang.org/genproto/googleapis/api v0.0.0-20250303144028-a0af3efb3deb/go.mod h1:jbe3Bkdp+Dh2IrslsFCklNhweNTBgSYanP1UXhJDhKg= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250218202821-56aae31c358a h1:51aaUVRocpvUOSQKM6Q7VuoaktNIaMCLuhZB6DKksq4= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250218202821-56aae31c358a/go.mod h1:uRxBH1mhmO8PGhU89cMcHaXKZqO+OfakD8QQO0oYwlQ= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250303144028-a0af3efb3deb h1:TLPQVbx1GJ8VKZxz52VAxl1EBgKXXbTiU9Fc5fZeLn4= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250303144028-a0af3efb3deb/go.mod h1:LuRYeWDFV6WOn90g357N17oMCaxpgCnbi/44qJvDn2I= -google.golang.org/grpc v1.71.0 h1:kF77BGdPTQ4/JZWMlb9VpJ5pa25aqvVqogsxNHHdeBg= -google.golang.org/grpc v1.71.0/go.mod h1:H0GRtasmQOh9LkFoCPDu3ZrwUtD1YGE+b2vYBYd/8Ec= -google.golang.org/grpc v1.71.1 h1:ffsFWr7ygTUscGPI0KKK6TLrGz0476KUvvsbqWK0rPI= -google.golang.org/grpc v1.71.1/go.mod h1:H0GRtasmQOh9LkFoCPDu3ZrwUtD1YGE+b2vYBYd/8Ec= -google.golang.org/protobuf v1.36.6 h1:z1NpPI8ku2WgiWnf+t9wTPsn6eP1L7ksHUlkfLvd9xY= -google.golang.org/protobuf v1.36.6/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY= +gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk= +gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E= +google.golang.org/genproto/googleapis/api v0.0.0-20260319201613-d00831a3d3e7 h1:41r6JMbpzBMen0R/4TZeeAmGXSJC7DftGINUodzTkPI= +google.golang.org/genproto/googleapis/api 
v0.0.0-20260319201613-d00831a3d3e7/go.mod h1:EIQZ5bFCfRQDV4MhRle7+OgjNtZ6P1PiZBgAKuxXu/Y= +google.golang.org/genproto/googleapis/rpc v0.0.0-20260311181403-84a4fc48630c h1:xgCzyF2LFIO/0X2UAoVRiXKU5Xg6VjToG4i2/ecSswk= +google.golang.org/genproto/googleapis/rpc v0.0.0-20260311181403-84a4fc48630c/go.mod h1:4Hqkh8ycfw05ld/3BWL7rJOSfebL2Q+DVDeRgYgxUU8= +google.golang.org/grpc v1.79.3 h1:sybAEdRIEtvcD68Gx7dmnwjZKlyfuc61Dyo9pGXXkKE= +google.golang.org/grpc v1.79.3/go.mod h1:KmT0Kjez+0dde/v2j9vzwoAScgEPx/Bw1CYChhHLrHQ= google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE= google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= -gopkg.in/evanphx/json-patch.v4 v4.12.0 h1:n6jtcsulIzXPJaxegRbvFNNrZDjbij7ny3gmSPG+6V4= -gopkg.in/evanphx/json-patch.v4 v4.12.0/go.mod h1:p8EYWUEYMpynmqDbY58zCKCFZw8pRWMG4EsWvDvM72M= +gopkg.in/evanphx/json-patch.v4 v4.13.0 h1:czT3CmqEaQ1aanPc5SdlgQrrEIb8w/wwCvWWnfEbYzo= +gopkg.in/evanphx/json-patch.v4 v4.13.0/go.mod h1:p8EYWUEYMpynmqDbY58zCKCFZw8pRWMG4EsWvDvM72M= gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= -gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -k8s.io/api v0.33.1 h1:tA6Cf3bHnLIrUK4IqEgb2v++/GYUtqiu9sRVk3iBXyw= -k8s.io/api v0.33.1/go.mod h1:87esjTn9DRSRTD4fWMXamiXxJhpOIREjWOSjsW1kEHw= -k8s.io/apiextensions-apiserver v0.33.1 
h1:N7ccbSlRN6I2QBcXevB73PixX2dQNIW0ZRuguEE91zI= -k8s.io/apiextensions-apiserver v0.33.1/go.mod h1:uNQ52z1A1Gu75QSa+pFK5bcXc4hq7lpOXbweZgi4dqA= -k8s.io/apimachinery v0.33.2 h1:IHFVhqg59mb8PJWTLi8m1mAoepkUNYmptHsV+Z1m5jY= -k8s.io/apimachinery v0.33.2/go.mod h1:BHW0YOu7n22fFv/JkYOEfkUYNRN0fj0BlvMFWA7b+SM= -k8s.io/apiserver v0.33.1 h1:yLgLUPDVC6tHbNcw5uE9mo1T6ELhJj7B0geifra3Qdo= -k8s.io/apiserver v0.33.1/go.mod h1:VMbE4ArWYLO01omz+k8hFjAdYfc3GVAYPrhP2tTKccs= -k8s.io/client-go v0.33.1 h1:ZZV/Ks2g92cyxWkRRnfUDsnhNn28eFpt26aGc8KbXF4= -k8s.io/client-go v0.33.1/go.mod h1:JAsUrl1ArO7uRVFWfcj6kOomSlCv+JpvIsp6usAGefA= -k8s.io/component-base v0.33.1 h1:EoJ0xA+wr77T+G8p6T3l4efT2oNwbqBVKR71E0tBIaI= -k8s.io/component-base v0.33.1/go.mod h1:guT/w/6piyPfTgq7gfvgetyXMIh10zuXA6cRRm3rDuY= +k8s.io/api v0.35.3 h1:pA2fiBc6+N9PDf7SAiluKGEBuScsTzd2uYBkA5RzNWQ= +k8s.io/api v0.35.3/go.mod h1:9Y9tkBcFwKNq2sxwZTQh1Njh9qHl81D0As56tu42GA4= +k8s.io/apiextensions-apiserver v0.35.3 h1:2fQUhEO7P17sijylbdwt0nBdXP0TvHrHj0KeqHD8FiU= +k8s.io/apiextensions-apiserver v0.35.3/go.mod h1:tK4Kz58ykRpwAEkXUb634HD1ZAegEElktz/B3jgETd8= +k8s.io/apimachinery v0.35.3 h1:MeaUwQCV3tjKP4bcwWGgZ/cp/vpsRnQzqO6J6tJyoF8= +k8s.io/apimachinery v0.35.3/go.mod h1:jQCgFZFR1F4Ik7hvr2g84RTJSZegBc8yHgFWKn//hns= +k8s.io/apiserver v0.35.3 h1:D2eIcfJ05hEAEewoSDg+05e0aSRwx8Y4Agvd/wiomUI= +k8s.io/apiserver v0.35.3/go.mod h1:JI0n9bHYzSgIxgIrfe21dbduJ9NHzKJ6RchcsmIKWKY= +k8s.io/client-go v0.35.3 h1:s1lZbpN4uI6IxeTM2cpdtrwHcSOBML1ODNTCCfsP1pg= +k8s.io/client-go v0.35.3/go.mod h1:RzoXkc0mzpWIDvBrRnD+VlfXP+lRzqQjCmKtiwZ8Q9c= +k8s.io/component-base v0.35.3 h1:mbKbzoIMy7JDWS/wqZobYW1JDVRn/RKRaoMQHP9c4P0= +k8s.io/component-base v0.35.3/go.mod h1:IZ8LEG30kPN4Et5NeC7vjNv5aU73ku5MS15iZyvyMYk= k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= -k8s.io/kube-openapi v0.0.0-20250610211856-8b98d1ed966a h1:ZV3Zr+/7s7aVbjNGICQt+ppKWsF1tehxggNfbM7XnG8= 
-k8s.io/kube-openapi v0.0.0-20250610211856-8b98d1ed966a/go.mod h1:5jIi+8yX4RIb8wk3XwBo5Pq2ccx4FP10ohkbSKCZoK8= -k8s.io/utils v0.0.0-20250604170112-4c0f3b243397 h1:hwvWFiBzdWw1FhfY1FooPn3kzWuJ8tmbZBHi4zVsl1Y= -k8s.io/utils v0.0.0-20250604170112-4c0f3b243397/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= +k8s.io/kube-openapi v0.0.0-20260330154417-16be699c7b31 h1:V+sn9a/1fEYDGwnllCmqXBk8x7obZ+hl869Q3Abumkg= +k8s.io/kube-openapi v0.0.0-20260330154417-16be699c7b31/go.mod h1:uGBT7iTA6c6MvqUvSXIaYZo9ukscABYi2btjhvgKGZ0= +k8s.io/utils v0.0.0-20260319190234-28399d86e0b5 h1:kBawHLSnx/mYHmRnNUf9d4CpjREbeZuxoSGOX/J+aYM= +k8s.io/utils v0.0.0-20260319190234-28399d86e0b5/go.mod h1:xDxuJ0whA3d0I4mf/C4ppKHxXynQ+fxnkmQH0vTHnuk= sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.2 h1:jpcvIRr3GLoUoEKRkHKSmGjxb6lWwrBlJsXc+eUYQHM= sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.2/go.mod h1:Ve9uj1L+deCXFrPOk1LpFXqTg7LCFzFso6PA48q/XZw= -sigs.k8s.io/controller-runtime v0.21.0 h1:CYfjpEuicjUecRk+KAeyYh+ouUBn4llGyDYytIGcJS8= -sigs.k8s.io/controller-runtime v0.21.0/go.mod h1:OSg14+F65eWqIu4DceX7k/+QRAbTTvxeQSNSOQpukWM= -sigs.k8s.io/gateway-api v1.2.1 h1:fZZ/+RyRb+Y5tGkwxFKuYuSRQHu9dZtbjenblleOLHM= -sigs.k8s.io/gateway-api v1.2.1/go.mod h1:EpNfEXNjiYfUJypf0eZ0P5iXA9ekSGWaS1WgPaM42X0= -sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 h1:gBQPwqORJ8d8/YNZWEjoZs7npUVDpVXUUOFfW6CgAqE= -sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg= -sigs.k8s.io/multicluster-runtime v0.21.0-alpha.8 h1:Pq69tTKfN8ADw8m8A3wUtP8wJ9SPQbbOsgapm3BZEPw= -sigs.k8s.io/multicluster-runtime v0.21.0-alpha.8/go.mod h1:CpBzLMLQKdm+UCchd2FiGPiDdCxM5dgCCPKuaQ6Fsv0= -sigs.k8s.io/randfill v0.0.0-20250304075658-069ef1bbf016/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY= +sigs.k8s.io/controller-runtime v0.23.3 h1:VjB/vhoPoA9l1kEKZHBMnQF33tdCLQKJtydy4iqwZ80= +sigs.k8s.io/controller-runtime v0.23.3/go.mod 
h1:B6COOxKptp+YaUT5q4l6LqUJTRpizbgf9KSRNdQGns0= +sigs.k8s.io/gateway-api v1.3.1-0.20250527223622-54df0a899c1c h1:GS4VnGRV90GEUjrgQ2GT5ii6yzWj3KtgUg+sVMdhs5c= +sigs.k8s.io/gateway-api v1.3.1-0.20250527223622-54df0a899c1c/go.mod h1:d8NV8nJbaRbEKem+5IuxkL8gJGOZ+FJ+NvOIltV8gDk= +sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 h1:IpInykpT6ceI+QxKBbEflcR5EXP7sU1kvOlxwZh5txg= +sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg= +sigs.k8s.io/multicluster-runtime v0.23.3 h1:vrzlXRzHTDsjspUAfoW2rCtr0agoI4q20p9x4Fz4png= +sigs.k8s.io/multicluster-runtime v0.23.3/go.mod h1:r/UA4GHgFoXCcR4tcvlZz7SiLx3l1kJKDuBAhILNIHs= sigs.k8s.io/randfill v1.0.0 h1:JfjMILfT8A6RbawdsK2JXGBR5AQVfd+9TbzrlneTyrU= sigs.k8s.io/randfill v1.0.0/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY= -sigs.k8s.io/structured-merge-diff/v4 v4.7.0 h1:qPeWmscJcXP0snki5IYF79Z8xrl8ETFxgMd7wez1XkI= -sigs.k8s.io/structured-merge-diff/v4 v4.7.0/go.mod h1:dDy58f92j70zLsuZVuUX5Wp9vtxXpaZnkPGWeqDfCps= -sigs.k8s.io/yaml v1.4.0/go.mod h1:Ejl7/uTz7PSA4eKMyQCUTnhZYNmLIl+5c2lQPGR2BPY= -sigs.k8s.io/yaml v1.5.0 h1:M10b2U7aEUY6hRtU870n2VTPgR5RZiL/I6Lcc2F4NUQ= -sigs.k8s.io/yaml v1.5.0/go.mod h1:wZs27Rbxoai4C0f8/9urLZtZtF3avA3gKvGyPdDqTO4= +sigs.k8s.io/structured-merge-diff/v6 v6.3.2 h1:kwVWMx5yS1CrnFWA/2QHyRVJ8jM6dBA80uLmm0wJkk8= +sigs.k8s.io/structured-merge-diff/v6 v6.3.2/go.mod h1:M3W8sfWvn2HhQDIbGWj3S099YozAsymCo/wrT5ohRUE= +sigs.k8s.io/yaml v1.6.0 h1:G8fkbMSAFqgEFgh4b1wmtzDnioxFCUgTZhlbj5P9QYs= +sigs.k8s.io/yaml v1.6.0/go.mod h1:796bPqUfzR/0jLAl6XjHl3Ck7MiyVv8dbTdyT3/pMf4= diff --git a/hack/e2e/kind-control-plane.yaml b/hack/e2e/kind-control-plane.yaml new file mode 100644 index 00000000..47f3c63b --- /dev/null +++ b/hack/e2e/kind-control-plane.yaml @@ -0,0 +1,17 @@ +# Kind cluster configuration for the compute-control-plane management cluster. 
+#
+# extraPortMappings exposes port 32443 on the macOS host so that the Karmada
+# API server NodePort service (nodePort: 32443) is accessible at
+# https://localhost:32443 without any additional port-forwarding.
+#
+# This matches KARMADA_API_NODEPORT in Taskfile.yaml.
+
+kind: Cluster
+apiVersion: kind.x-k8s.io/v1alpha4
+nodes:
+  - role: control-plane
+    extraPortMappings:
+      - containerPort: 32443  # Karmada API server NodePort
+        hostPort: 32443
+        protocol: TCP
+        listenAddress: "127.0.0.1"
diff --git a/hack/e2e/make-internal-kubeconfig.sh b/hack/e2e/make-internal-kubeconfig.sh
new file mode 100755
index 00000000..3303a5bd
--- /dev/null
+++ b/hack/e2e/make-internal-kubeconfig.sh
@@ -0,0 +1,60 @@
+#!/usr/bin/env bash
+# make-internal-kubeconfig.sh
+#
+# Produces a kubeconfig variant that uses the Kind node's Docker container IP
+# instead of localhost. This variant is stored in Karmada so the controller
+# manager (running inside Docker) can reach member cluster API servers across
+# the kind bridge network.
+#
+# Background: Kind maps each cluster's API server to a random localhost port
+# on the developer machine. Inside Docker containers, "localhost" refers to the
+# container's own loopback — not the host. We therefore swap the server address
+# to the Kind control-plane container's Docker bridge IP (e.g. 172.18.0.x) and
+# set insecure-skip-tls-verify because the node certificate does not include
+# the Docker bridge IP in its SANs.
+#
+# Usage:
+#   hack/e2e/make-internal-kubeconfig.sh \
+#     tmp/e2e/kubeconfigs/pop-dfw.yaml \
+#     tmp/e2e/kubeconfigs/pop-dfw-internal.yaml \
+#     compute-pop-dfw
+
+set -euo pipefail
+
+INPUT="${1:?usage: $0 <input-kubeconfig> <output-kubeconfig> <cluster-name>}"
+OUTPUT="${2:?usage: $0 <input-kubeconfig> <output-kubeconfig> <cluster-name>}"
+CLUSTER_NAME="${3:?usage: $0 <input-kubeconfig> <output-kubeconfig> <cluster-name>}"
+
+CONTAINER_NAME="${CLUSTER_NAME}-control-plane"
+
+# Resolve the container's Docker bridge IP.
+DOCKER_IP=$(docker inspect \
+  -f '{{range .NetworkSettings.Networks}}{{.IPAddress}} {{end}}' \
+  "${CONTAINER_NAME}" 2>/dev/null || true)
+
+# The template prints one address per attached network, space-separated.
+# Keep only the first so a multi-network container cannot produce a
+# concatenated, invalid address in the server URL below.
+read -r DOCKER_IP _ <<<"${DOCKER_IP}"
+
+if [ -z "${DOCKER_IP}" ]; then
+  echo "ERROR: Could not resolve Docker IP for container '${CONTAINER_NAME}'." >&2
+  echo " Is the Kind cluster '${CLUSTER_NAME}' running?" >&2
+  exit 1
+fi
+
+echo " ${CLUSTER_NAME}: Docker IP ${DOCKER_IP} → ${OUTPUT}"
+
+# Rewrite every cluster entry of INPUT to point at the Docker IP and write
+# the result to OUTPUT. NOTE(review): requires the PyYAML package.
+python3 - "${INPUT}" "${OUTPUT}" "${DOCKER_IP}" <<'PYEOF'
+import sys, yaml
+
+src, dst, docker_ip = sys.argv[1], sys.argv[2], sys.argv[3]
+
+with open(src) as f:
+    cfg = yaml.safe_load(f)
+
+# An empty file, a non-mapping document, or a missing/null "clusters" list
+# cannot be rewritten; fail loudly instead of emitting a useless kubeconfig.
+clusters = cfg.get('clusters') if isinstance(cfg, dict) else None
+if not clusters:
+    sys.exit(f'ERROR: no clusters found in kubeconfig {src}')
+
+for cluster in clusters:
+    # Kind API server always listens on port 6443 inside the container.
+    cluster['cluster']['server'] = f'https://{docker_ip}:6443'
+    # The node cert only covers localhost / 127.0.0.1, not the bridge IP.
+    cluster['cluster']['insecure-skip-tls-verify'] = True
+    cluster['cluster'].pop('certificate-authority-data', None)
+
+with open(dst, 'w') as f:
+    yaml.dump(cfg, f, default_flow_style=False)
+PYEOF
diff --git a/hack/e2e/patch-cluster-secret.sh b/hack/e2e/patch-cluster-secret.sh
new file mode 100755
index 00000000..e29ed383
--- /dev/null
+++ b/hack/e2e/patch-cluster-secret.sh
@@ -0,0 +1,90 @@
+#!/usr/bin/env bash
+# patch-cluster-secret.sh
+#
+# After "karmadactl join", Karmada stores the member cluster's kubeconfig in a
+# Secret referenced by the Cluster object's spec.secretRef, and sets
+# spec.apiEndpoint to the localhost address it resolved from the external
+# kubeconfig. The Karmada controller manager runs inside Docker and cannot use
+# localhost to reach POP cell API servers.
+#
+# This script:
+#   1. Replaces the kubeconfig in the Secret with the Docker-IP variant so that
+#      the Karmada controller can make API calls to the member cluster.
+#   2. Patches spec.apiEndpoint on the Cluster object so that health checks also
+#      use the Docker bridge IP instead of localhost.
+#
+# Usage:
+#   hack/e2e/patch-cluster-secret.sh \
+#     tmp/e2e/kubeconfigs/karmada.yaml \
+#     compute-pop-dfw \
+#     tmp/e2e/kubeconfigs/pop-dfw-internal.yaml
+
+set -euo pipefail
+
+KARMADA_KUBECONFIG="${1:?usage: $0 <karmada-kubeconfig> <cluster-name> <internal-kubeconfig>}"
+CLUSTER_NAME="${2:?usage: $0 <karmada-kubeconfig> <cluster-name> <internal-kubeconfig>}"
+INTERNAL_KUBECONFIG="${3:?usage: $0 <karmada-kubeconfig> <cluster-name> <internal-kubeconfig>}"
+
+# ------------------------------------------------------------------
+# Read the Cluster object's secretRef (name + namespace)
+# ------------------------------------------------------------------
+# kubectl failures are deliberately swallowed here; an empty result is
+# turned into a descriptive error by the check that follows.
+SECRET_NAME=$(kubectl \
+  --kubeconfig="${KARMADA_KUBECONFIG}" \
+  get cluster "${CLUSTER_NAME}" \
+  -o jsonpath='{.spec.secretRef.name}' 2>/dev/null || true)
+
+if [ -z "${SECRET_NAME}" ]; then
+  echo "ERROR: Could not find spec.secretRef.name on cluster '${CLUSTER_NAME}'." >&2
+  echo " Has karmadactl join completed successfully?" >&2
+  exit 1
+fi
+
+SECRET_NAMESPACE=$(kubectl \
+  --kubeconfig="${KARMADA_KUBECONFIG}" \
+  get cluster "${CLUSTER_NAME}" \
+  -o jsonpath='{.spec.secretRef.namespace}' 2>/dev/null || true)
+
+# Fall back to karmada-system when the Cluster object does not name one.
+SECRET_NAMESPACE="${SECRET_NAMESPACE:-karmada-system}"
+
+echo " Patching secret ${SECRET_NAMESPACE}/${SECRET_NAME} with Docker-IP kubeconfig..."
+ +# ------------------------------------------------------------------ +# Replace the kubeconfig data in the secret +# ------------------------------------------------------------------ +kubectl \ + --kubeconfig="${KARMADA_KUBECONFIG}" \ + create secret generic "${SECRET_NAME}" \ + --namespace="${SECRET_NAMESPACE}" \ + --from-file=kubeconfig="${INTERNAL_KUBECONFIG}" \ + --dry-run=client -o yaml \ + | kubectl \ + --kubeconfig="${KARMADA_KUBECONFIG}" \ + apply -f - + +echo " Secret ${SECRET_NAMESPACE}/${SECRET_NAME} updated — Karmada controller will use Docker bridge IP" + +# ------------------------------------------------------------------ +# Extract the Docker-IP server URL from the internal kubeconfig and +# patch spec.apiEndpoint on the Cluster object so that Karmada's +# cluster-status controller uses the same reachable address for health +# checks. Without this patch the controller continues to probe the +# localhost address stored by karmadactl join and the cluster never +# transitions to Ready. +# ------------------------------------------------------------------ +DOCKER_SERVER=$(kubectl \ + --kubeconfig="${INTERNAL_KUBECONFIG}" \ + config view --minify -o jsonpath='{.clusters[0].cluster.server}') + +if [ -z "${DOCKER_SERVER}" ]; then + echo "ERROR: Could not read server URL from ${INTERNAL_KUBECONFIG}" >&2 + exit 1 +fi + +echo " Patching spec.apiEndpoint on cluster '${CLUSTER_NAME}' → ${DOCKER_SERVER}..." 
+kubectl \
+  --kubeconfig="${KARMADA_KUBECONFIG}" \
+  patch cluster "${CLUSTER_NAME}" \
+  --type=merge \
+  -p "{\"spec\":{\"apiEndpoint\":\"${DOCKER_SERVER}\"}}"
+
+echo " Cluster '${CLUSTER_NAME}' patched — health checks will now use Docker bridge IP"
diff --git a/internal/cmd/compute/deploy/deploy.go b/internal/cmd/compute/deploy/deploy.go
new file mode 100644
index 00000000..2e84851a
--- /dev/null
+++ b/internal/cmd/compute/deploy/deploy.go
@@ -0,0 +1,425 @@
+package deploy
+
+import (
+	"bufio"
+	"bytes"
+	"context"
+	"fmt"
+	"os"
+	"os/signal"
+	"strings"
+
+	"github.com/spf13/cobra"
+	"golang.org/x/term"
+	corev1 "k8s.io/api/core/v1"
+	k8serrors "k8s.io/apimachinery/pkg/api/errors"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/types"
+	utilyaml "k8s.io/apimachinery/pkg/util/yaml"
+	sigsyaml "sigs.k8s.io/yaml"
+
+	computev1alpha "go.datum.net/compute/api/v1alpha"
+	"go.datum.net/compute/internal/cmd/compute/util"
+	"go.datum.net/compute/internal/cmd/compute/watch"
+	networkingv1alpha "go.datum.net/network-services-operator/api/v1alpha"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+)
+
+// options holds the values of the deploy command's flags.
+type options struct {
+	image        string   // --image: container image to deploy
+	instanceType string   // --instance-type: defaults to datumcloud/d1-standard-2
+	cities       []string // --city: city codes to deploy to
+	min          int32    // --min: minimum instances per city, defaults to 1
+	port         int32    // --port: port to expose; 0 means none
+	file         string   // -f/--file: path to a workload manifest
+	yes          bool     // -y/--yes: skip confirmation prompts
+}
+
+// Command builds the "deploy" cobra command, registers its flags and wires
+// RunE to runDeploy.
+func Command() *cobra.Command {
+	opts := &options{}
+
+	cmd := &cobra.Command{
+		Use:   "deploy [workload-name]",
+		Short: "Deploy or update a workload",
+		Long: `Deploy a container image as a workload across one or more cities.
+
+If no arguments are given, an interactive prompt guides you through the deployment.
+Use -f to apply a workload manifest file instead of flags.`,
+		Args: cobra.MaximumNArgs(1),
+		Example: `  # Deploy with flags
+  datumctl compute deploy api --image=ghcr.io/acme/api:1.4.2 --city=DFW,IAD --min=2 --port=8080
+
+  # Interactive mode
+  datumctl compute deploy
+
+  # Manifest-driven
+  datumctl compute deploy -f workload.yaml`,
+		RunE: func(cmd *cobra.Command, args []string) error {
+			return runDeploy(cmd, args, opts)
+		},
+		ValidArgsFunction: util.CompleteWorkloadNamesAndFlags,
+	}
+
+	cmd.Flags().StringVar(&opts.image, "image", "", "Container image to deploy (e.g. ghcr.io/acme/api:1.4.2)")
+	cmd.Flags().StringVar(&opts.instanceType, "instance-type", "datumcloud/d1-standard-2", "Instance type (e.g. datumcloud/d1-standard-2)")
+	cmd.Flags().StringSliceVar(&opts.cities, "city", nil, "One or more city codes to deploy to (e.g. DFW,IAD)")
+	cmd.Flags().Int32Var(&opts.min, "min", 1, "Minimum number of instances per city")
+	cmd.Flags().Int32Var(&opts.port, "port", 0, "Port to expose on the workload (optional)")
+	cmd.Flags().StringVarP(&opts.file, "file", "f", "", "Path to a workload manifest file")
+	cmd.Flags().BoolVarP(&opts.yes, "yes", "y", false, "Skip confirmation prompts")
+
+	return cmd
+}
+
+// runDeploy selects the deployment path: a manifest file (-f) takes
+// precedence; otherwise a workload name plus --image selects the flag-driven
+// path; anything else is an error.
+//
+// NOTE(review): the help text advertises an interactive mode when no
+// arguments are given, but no such path is taken here — that invocation
+// returns the error below. Confirm whether interactive mode is planned.
+func runDeploy(cmd *cobra.Command, args []string, opts *options) error {
+	// Determine path.
+	if opts.file != "" {
+		return deployFromFile(cmd, opts)
+	}
+
+	if len(args) > 0 && opts.image != "" {
+		return deployFromFlags(cmd, args[0], opts)
+	}
+
+	return fmt.Errorf("workload name and --image are required, or use -f to specify a manifest file")
+}
+
+// deployFromFlags implements Path A: deploy a workload using CLI flags.
+func deployFromFlags(cmd *cobra.Command, workloadName string, opts *options) error { + project := util.ProjectFromCmd(cmd) + if project == "" { + return fmt.Errorf("no project set — pass --project or run 'datumctl config set project '") + } + if opts.image == "" { + return fmt.Errorf("--image is required") + } + if len(opts.cities) == 0 { + return fmt.Errorf("--city is required (e.g. --city=DFW,IAD)") + } + instanceType := opts.instanceType + if instanceType == "" { + instanceType = "datumcloud/d1-standard-2" + } + + c, err := util.NewClient(project) + if err != nil { + return err + } + + ctx := context.Background() + out := cmd.OutOrStdout() + + if err := ensureNetwork(ctx, cmd, c, "default", project, opts); err != nil { + return err + } + + fmt.Fprintf(out, "Resolving workload %q in project %s...\n", workloadName, project) + + var workload computev1alpha.Workload + creating := false + if err := c.Get(ctx, types.NamespacedName{Namespace: util.ResourceNamespace, Name: workloadName}, &workload); err != nil { + if k8serrors.IsNotFound(err) { + creating = true + workload = computev1alpha.Workload{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: util.ResourceNamespace, + Name: workloadName, + }, + } + } else { + return fmt.Errorf("getting workload: %w", err) + } + } + + // Build spec. + tcp := corev1.ProtocolTCP + container := computev1alpha.SandboxContainer{ + Name: "app", + Image: opts.image, + } + if opts.port > 0 { + container.Ports = []computev1alpha.NamedPort{ + {Name: "http", Port: opts.port, Protocol: &tcp}, + } + } + + // All cities go into one "default" placement. 
+ placement := computev1alpha.WorkloadPlacement{ + Name: "default", + CityCodes: opts.cities, + ScaleSettings: computev1alpha.HorizontalScaleSettings{ + MinReplicas: opts.min, + InstanceManagementPolicy: computev1alpha.OrderedReadyInstanceManagementPolicyType, + }, + } + + workload.Spec = computev1alpha.WorkloadSpec{ + Template: computev1alpha.InstanceTemplateSpec{ + Spec: computev1alpha.InstanceSpec{ + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{ + InstanceType: instanceType, + }, + Sandbox: &computev1alpha.SandboxRuntime{ + Containers: []computev1alpha.SandboxContainer{container}, + }, + }, + NetworkInterfaces: []computev1alpha.InstanceNetworkInterface{ + { + // TODO: "default" network name is a convention; confirm with platform team. + Network: networkingv1alpha.NetworkRef{Name: "default"}, + }, + }, + }, + }, + Placements: []computev1alpha.WorkloadPlacement{placement}, + } + + fmt.Fprintf(out, " Placement \"default\": cities=[%s], min=%d\n", + strings.Join(opts.cities, ", "), opts.min) + + // Prompt unless --yes or non-interactive. + if !opts.yes && term.IsTerminal(int(os.Stdin.Fd())) { + _, _ = fmt.Fprint(out, "Apply? (Y/n): ") + line, err := bufio.NewReader(os.Stdin).ReadString('\n') + if err != nil { + return fmt.Errorf("reading confirmation: %w", err) + } + line = strings.TrimSpace(line) + if line == "n" || line == "N" { + _, _ = fmt.Fprintln(out, "Aborted.") + return nil + } + } + + if creating { + workload.Namespace = util.ResourceNamespace + if err := c.Create(ctx, &workload); err != nil { + return fmt.Errorf("creating workload: %w", err) + } + fmt.Fprintf(out, " workload/%s created\n", workloadName) + } else { + if err := c.Update(ctx, &workload); err != nil { + return fmt.Errorf("updating workload: %w", err) + } + fmt.Fprintf(out, " workload/%s updated\n", workloadName) + } + + // Save workload.yaml. 
+ if err := saveWorkloadYAML(workloadName, &workload); err != nil { + fmt.Fprintf(out, " warning: could not save workload.yaml: %v\n", err) + } else { + _, _ = fmt.Fprintln(out, "Saved workload.yaml") + } + + fmt.Fprintf(out, "Waiting for rollout. Ctrl-C to detach (rollout continues in background).\n\n") + + watchCtx, cancel := signal.NotifyContext(cmd.Context(), os.Interrupt) + defer cancel() + return watch.Rollout(watchCtx, c, out, project, workload.UID) +} + +// deployFromFile implements Path C: deploy from a manifest file. +func deployFromFile(cmd *cobra.Command, opts *options) error { + project := util.ProjectFromCmd(cmd) + if project == "" { + return fmt.Errorf("no project set — pass --project or run 'datumctl config set project '") + } + + data, err := os.ReadFile(opts.file) + if err != nil { + return fmt.Errorf("reading manifest: %w", err) + } + + var workload computev1alpha.Workload + decoder := utilyaml.NewYAMLOrJSONDecoder(bytes.NewReader(data), 4096) + if err := decoder.Decode(&workload); err != nil { + return fmt.Errorf("decoding manifest: %w", err) + } + + workload.Namespace = util.ResourceNamespace + + c, err := util.NewClient(project) + if err != nil { + return err + } + + ctx := context.Background() + out := cmd.OutOrStdout() + + for _, iface := range workload.Spec.Template.Spec.NetworkInterfaces { + if err := ensureNetwork(ctx, cmd, c, iface.Network.Name, project, opts); err != nil { + return err + } + } + + var existing computev1alpha.Workload + creating := false + if err := c.Get(ctx, types.NamespacedName{Namespace: util.ResourceNamespace, Name: workload.Name}, &existing); err != nil { + if k8serrors.IsNotFound(err) { + creating = true + } else { + return fmt.Errorf("getting workload: %w", err) + } + } + + var diffLines []string + if !creating { + diffLines = manifestDiff(existing, workload) + for _, l := range diffLines { + _, _ = fmt.Fprintln(out, l) + } + if len(diffLines) == 0 { + _, _ = fmt.Fprintln(out, "No changes detected.") + } + } + + 
// Prompt unless --yes or non-interactive. + if !opts.yes && term.IsTerminal(int(os.Stdin.Fd())) { + _, _ = fmt.Fprint(out, "Apply? (Y/n): ") + line, err := bufio.NewReader(os.Stdin).ReadString('\n') + if err != nil { + return fmt.Errorf("reading confirmation: %w", err) + } + line = strings.TrimSpace(line) + if line == "n" || line == "N" { + _, _ = fmt.Fprintln(out, "Aborted.") + return nil + } + } + + if creating { + if err := c.Create(ctx, &workload); err != nil { + return fmt.Errorf("creating workload: %w", err) + } + fmt.Fprintf(out, " workload/%s created\n", workload.Name) + } else { + workload.ResourceVersion = existing.ResourceVersion + if err := c.Update(ctx, &workload); err != nil { + return fmt.Errorf("updating workload: %w", err) + } + fmt.Fprintf(out, " workload/%s updated\n", workload.Name) + } + + fmt.Fprintf(out, "Waiting for rollout. Ctrl-C to detach (rollout continues in background).\n\n") + + watchCtx, cancel := signal.NotifyContext(cmd.Context(), os.Interrupt) + defer cancel() + return watch.Rollout(watchCtx, c, out, project, workload.UID) +} + +// saveWorkloadYAML marshals the workload and writes it to workload.yaml in the +// current directory. +func saveWorkloadYAML(_ string, workload *computev1alpha.Workload) error { + workload.TypeMeta = metav1.TypeMeta{ + APIVersion: "compute.datumapis.com/v1alpha", + Kind: "Workload", + } + + data, err := sigsyaml.Marshal(workload) + if err != nil { + return fmt.Errorf("marshalling workload: %w", err) + } + + header := "# Managed by datumctl compute deploy. Commit this file to manage your workload declaratively.\n" + + "# Apply changes with: datumctl compute deploy -f workload.yaml\n" + + return os.WriteFile("workload.yaml", append([]byte(header), data...), 0o644) +} + +// imageFromWorkload returns the first container image found in a workload, or empty string. 
+func imageFromWorkload(w computev1alpha.Workload) string { + sb := w.Spec.Template.Spec.Runtime.Sandbox + if sb != nil && len(sb.Containers) > 0 { + return sb.Containers[0].Image + } + return "" +} + +// ensureNetwork checks if the named network exists and, if not, offers to create it. +// It creates a minimal auto-IPAM IPv4 network on behalf of the user. +func ensureNetwork(ctx context.Context, cmd *cobra.Command, c client.Client, networkName, project string, opts *options) error { + var network networkingv1alpha.Network + err := c.Get(ctx, types.NamespacedName{Namespace: util.ResourceNamespace, Name: networkName}, &network) + if err == nil { + return nil + } + if !k8serrors.IsNotFound(err) { + return fmt.Errorf("checking network %q: %w", networkName, err) + } + + out := cmd.OutOrStdout() + fmt.Fprintf(out, " Network %q does not exist in project %s.\n", networkName, project) + + if !opts.yes && term.IsTerminal(int(os.Stdin.Fd())) { + fmt.Fprintf(out, " Create it now? (Y/n): ") + line, readErr := bufio.NewReader(os.Stdin).ReadString('\n') + if readErr != nil { + return fmt.Errorf("reading confirmation: %w", readErr) + } + line = strings.TrimSpace(line) + if line == "n" || line == "N" { + return fmt.Errorf("network %q is required — create it with: datumctl apply -f network.yaml --project %s", networkName, project) + } + } else if !opts.yes { + return fmt.Errorf("network %q not found in project %s — use --yes to auto-create or create it first", networkName, project) + } + + ipv4Mode := networkingv1alpha.NetworkIPAMModeAuto + newNetwork := networkingv1alpha.Network{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: util.ResourceNamespace, + Name: networkName, + }, + Spec: networkingv1alpha.NetworkSpec{ + IPAM: networkingv1alpha.NetworkIPAM{ + Mode: ipv4Mode, + }, + }, + } + if err := c.Create(ctx, &newNetwork); err != nil { + return fmt.Errorf("creating network %q: %w", networkName, err) + } + fmt.Fprintf(out, " network/%s created\n", networkName) + return nil +} + +// 
manifestDiff computes diff lines between an existing and desired workload.
+//
+// It reports a change of the first container image, then per-placement
+// changes (added placements, changed min replicas, changed city list) in the
+// order placements appear in desired, then removed placements in the order
+// they appear in existing. Other spec fields are not compared, so an empty
+// result does not prove the two workloads are identical.
+func manifestDiff(existing, desired computev1alpha.Workload) []string {
+	var lines []string
+
+	oldImage := imageFromWorkload(existing)
+	newImage := imageFromWorkload(desired)
+	if oldImage != newImage {
+		lines = append(lines, fmt.Sprintf(" image: %s → %s", oldImage, newImage))
+	}
+
+	// Index placements by name; on duplicate names the last entry wins.
+	oldPlacements := make(map[string]computev1alpha.WorkloadPlacement, len(existing.Spec.Placements))
+	for _, p := range existing.Spec.Placements {
+		oldPlacements[p.Name] = p
+	}
+	newPlacements := make(map[string]computev1alpha.WorkloadPlacement, len(desired.Spec.Placements))
+	for _, p := range desired.Spec.Placements {
+		newPlacements[p.Name] = p
+	}
+
+	// Walk the slices rather than the maps so the output order is stable
+	// from run to run.
+	reported := make(map[string]bool, len(newPlacements))
+	for _, p := range desired.Spec.Placements {
+		name := p.Name
+		if reported[name] {
+			continue
+		}
+		reported[name] = true
+
+		np := newPlacements[name]
+		op, ok := oldPlacements[name]
+		if !ok {
+			lines = append(lines, fmt.Sprintf(" + new placement %q: cities=[%s]",
+				name, strings.Join(np.CityCodes, ", ")))
+			continue
+		}
+		if op.ScaleSettings.MinReplicas != np.ScaleSettings.MinReplicas {
+			lines = append(lines, fmt.Sprintf(" placement %q min replicas: %d → %d",
+				name, op.ScaleSettings.MinReplicas, np.ScaleSettings.MinReplicas))
+		}
+		oldCities := strings.Join(op.CityCodes, ", ")
+		newCities := strings.Join(np.CityCodes, ", ")
+		if oldCities != newCities {
+			lines = append(lines, fmt.Sprintf(" placement %q cities: [%s] → [%s]",
+				name, oldCities, newCities))
+		}
+	}
+
+	removed := make(map[string]bool, len(oldPlacements))
+	for _, p := range existing.Spec.Placements {
+		if _, ok := newPlacements[p.Name]; ok || removed[p.Name] {
+			continue
+		}
+		removed[p.Name] = true
+		lines = append(lines, fmt.Sprintf(" - removed placement %q", p.Name))
+	}
+
+	return lines
+}
diff --git a/internal/cmd/compute/destroy/destroy.go b/internal/cmd/compute/destroy/destroy.go
new file mode 100644
index 00000000..e95e02c3
--- /dev/null
+++ b/internal/cmd/compute/destroy/destroy.go
@@ -0,0 +1,92 @@
+package destroy
+
+import (
+	"bufio"
+	"context"
+	"fmt"
+	"os"
+	"strings"
+
+	"github.com/spf13/cobra"
+	"golang.org/x/term"
+	k8serrors "k8s.io/apimachinery/pkg/api/errors"
+	"k8s.io/apimachinery/pkg/types"
+
+	computev1alpha "go.datum.net/compute/api/v1alpha"
+	"go.datum.net/compute/internal/cmd/compute/util"
+)
+
+// Command builds the "destroy" cobra command, which takes exactly one
+// workload name and an optional -y/--yes flag to skip the confirmation.
+func Command() *cobra.Command {
+	var yes bool
+
+	cmd := &cobra.Command{
+		Use:   "destroy <workload-name>",
+		Short: "Delete a workload and all its instances",
+		Args:  cobra.ExactArgs(1),
+		RunE: func(cmd *cobra.Command, args []string) error {
+			return runDestroy(cmd, args, yes)
+		},
+		ValidArgsFunction: util.CompleteWorkloadNames,
+	}
+
+	cmd.Flags().BoolVarP(&yes, "yes", "y", false, "Skip confirmation prompt")
+
+	return cmd
+}
+
+// runDestroy looks up the workload named by args[0], prints a summary of its
+// placements, asks for confirmation (unless yes is set or stdin is not a
+// terminal) and deletes the workload.
+func runDestroy(cmd *cobra.Command, args []string, yes bool) error {
+	project := util.ProjectFromCmd(cmd)
+
+	c, err := util.NewClient(project)
+	if err != nil {
+		return err
+	}
+
+	// Prefer the command's context so the API calls observe cancellation;
+	// it is nil when RunE is invoked without Execute.
+	ctx := cmd.Context()
+	if ctx == nil {
+		ctx = context.Background()
+	}
+	workloadName := args[0]
+
+	var workload computev1alpha.Workload
+	if err := c.Get(ctx, types.NamespacedName{Namespace: util.ResourceNamespace, Name: workloadName}, &workload); err != nil {
+		if k8serrors.IsNotFound(err) {
+			return fmt.Errorf("workload %q not found in project %s", workloadName, project)
+		}
+		return fmt.Errorf("getting workload: %w", err)
+	}
+
+	// Summarize placements.
+	var allCityCodes []string
+	var totalMin int32
+	for _, p := range workload.Spec.Placements {
+		allCityCodes = append(allCityCodes, p.CityCodes...)
+		totalMin += p.ScaleSettings.MinReplicas
+	}
+
+	out := cmd.OutOrStdout()
+	fmt.Fprintf(out, "Workload: %s\nPlacements: %d Cities: %s\nMin replicas: %d\n\n",
+		workloadName,
+		len(workload.Spec.Placements),
+		strings.Join(allCityCodes, ", "),
+		totalMin,
+	)
+
+	// Prompt unless --yes or non-interactive.
+	if !yes && term.IsTerminal(int(os.Stdin.Fd())) {
+		_, _ = fmt.Fprint(out, "This will delete workload and all its instances. Continue? (y/N): ")
+		line, err := bufio.NewReader(os.Stdin).ReadString('\n')
+		if err != nil {
+			return fmt.Errorf("reading confirmation: %w", err)
+		}
+		// Only an explicit "y" or "yes" (any case) proceeds.
+		switch strings.ToLower(strings.TrimSpace(line)) {
+		case "y", "yes":
+		default:
+			_, _ = fmt.Fprintln(out, "Aborted.")
+			return nil
+		}
+	}
+
+	if err := c.Delete(ctx, &workload); err != nil {
+		return fmt.Errorf("deleting workload: %w", err)
+	}
+
+	fmt.Fprintf(out, "workload/%s deleted.\n", workloadName)
+	return nil
+}
diff --git a/internal/cmd/compute/instances/instances.go b/internal/cmd/compute/instances/instances.go
new file mode 100644
index 00000000..0cf897c8
--- /dev/null
+++ b/internal/cmd/compute/instances/instances.go
@@ -0,0 +1,504 @@
+package instances
+
+import (
+	"context"
+	"fmt"
+	"sort"
+	"strings"
+	"unicode"
+
+	"github.com/spf13/cobra"
+	corev1 "k8s.io/api/core/v1"
+	k8serrors "k8s.io/apimachinery/pkg/api/errors"
+	"k8s.io/apimachinery/pkg/labels"
+	"k8s.io/apimachinery/pkg/types"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+
+	computev1alpha "go.datum.net/compute/api/v1alpha"
+	"go.datum.net/compute/internal/cmd/compute/util"
+)
+
+type listOptions struct {
+	workload string
+	city     string
+}
+
+func Command() *cobra.Command {
+	opts := &listOptions{}
+
+	cmd := &cobra.Command{
+		Use:   "instances",
+		Short: "List or inspect workload instances",
+		Long: `List all running instances in the project, optionally filtered by workload.
+Use the describe subcommand for full details on a single instance.`,
+		Example: `  # List all instances
+  datumctl compute instances
+
+  # Filter by workload
+  datumctl compute instances --workload=api
+
+  # Filter by city
+  datumctl compute instances --city=DFW
+
+  # Machine-readable output
+  datumctl compute instances -o json
+
+  # Describe a single instance
+  datumctl compute instances describe api-dfw-0`,
+		RunE: func(cmd *cobra.Command, args []string) error {
+			return runList(cmd, opts)
+		},
+	}
+
+	cmd.Flags().StringVar(&opts.workload, "workload", "", "Filter instances to a specific workload")
+	cmd.Flags().StringVar(&opts.city, "city", "", "Filter instances to a specific city")
+	cmd.Flags().StringP("output", "o", "table", "Output format: table, wide, json, yaml")
+	cmd.Flags().Bool("no-headers", false, "Omit the table header row (table and wide only)")
+
+	_ = cmd.RegisterFlagCompletionFunc("workload", util.CompleteWorkloadNames)
+	_ = cmd.RegisterFlagCompletionFunc("city", util.CompleteCityCodes)
+	_ = cmd.RegisterFlagCompletionFunc("output", util.CompleteOutputFormats("table", "wide", "json", "yaml"))
+
+	cmd.AddCommand(describeCommand())
+
+	return cmd
+}
+
+type instanceRow struct {
+	name        string
+	workload    string
+	city        string
+	externalIP  string
+	internalIP  string
+	runtimeKind string // "sandbox" or "vm"
+	instType    string
+	age         string
+	status      string
+}
+
+func runList(cmd *cobra.Command, opts *listOptions) error {
+	ctx := context.Background()
+	project := util.ProjectFromCmd(cmd)
+	outputFlag, _ := cmd.Flags().GetString("output")
+	noHeaders, _ := cmd.Flags().GetBool("no-headers")
+
+	c, err := util.NewClient(project)
+	if err != nil {
+		return err
+	}
+
+	// Optionally resolve workload UID.
+ var workloadUID string + if opts.workload != "" { + var wl computev1alpha.Workload + if err := c.Get(ctx, types.NamespacedName{Namespace: util.ResourceNamespace, Name: opts.workload}, &wl); err != nil { + if k8serrors.IsNotFound(err) { + return fmt.Errorf("workload %q not found", opts.workload) + } + return fmt.Errorf("getting workload: %w", err) + } + workloadUID = string(wl.UID) + } + + // List instances. + var instList computev1alpha.InstanceList + listOpts := []client.ListOption{client.InNamespace(util.ResourceNamespace)} + if workloadUID != "" { + selector := labels.SelectorFromSet(labels.Set{computev1alpha.WorkloadUIDLabel: workloadUID}) + listOpts = append(listOpts, client.MatchingLabelsSelector{Selector: selector}) + } + if err := c.List(ctx, &instList, listOpts...); err != nil { + return fmt.Errorf("listing instances: %w", err) + } + + // JSON/YAML: emit raw API resource and return early (before city filter). + switch util.OutputFormat(outputFlag) { + case util.OutputJSON: + return util.PrintJSON(cmd.OutOrStdout(), &instList) + case util.OutputYAML: + return util.PrintYAML(cmd.OutOrStdout(), &instList) + } + + // List deployments — build map deploymentName → *WorkloadDeployment. + // Keyed by name (not UID) because the WorkloadDeploymentUIDLabel on an + // Instance carries the edge/Karmada WD UID, which differs from the + // project-cluster WD UID. The WD name is identical across all planes. + var deployList computev1alpha.WorkloadDeploymentList + if err := c.List(ctx, &deployList, client.InNamespace(util.ResourceNamespace)); err != nil { + return fmt.Errorf("listing deployments: %w", err) + } + deploymentMap := make(map[string]*computev1alpha.WorkloadDeployment, len(deployList.Items)) + for i := range deployList.Items { + d := &deployList.Items[i] + deploymentMap[d.Name] = d + } + + // List workloads — build map workloadUID → name. 
+ var wlList computev1alpha.WorkloadList + if err := c.List(ctx, &wlList, client.InNamespace(util.ResourceNamespace)); err != nil { + return fmt.Errorf("listing workloads: %w", err) + } + workloadMap := make(map[string]string, len(wlList.Items)) + for _, wl := range wlList.Items { + workloadMap[string(wl.UID)] = wl.Name + } + + // Build rows. + var rows []instanceRow + for _, inst := range instList.Items { + wlUID := inst.Labels[computev1alpha.WorkloadUIDLabel] + + city := "unknown" + wlName := workloadMap[wlUID] + if wlName == "" { + wlName = "orphaned" + } + + // Prefer self-describing labels stamped at creation time (fast path — + // no join needed). Fall back to the WorkloadDeployment join for older + // instances that predate the labels. + labelCity := inst.Labels[computev1alpha.CityCodeLabel] + labelWLName := inst.Labels[computev1alpha.WorkloadNameLabel] + + if labelCity != "" && labelWLName != "" { + // Both labels present: no join needed. + city = labelCity + wlName = labelWLName + } else { + // At least one label absent — fall back to WorkloadDeployment lookup. + // Prefer the explicit WorkloadDeploymentNameLabel; fall back to + // deriving the WD name from the Instance name for existing instances + // that predate the label. + depName := inst.Labels[computev1alpha.WorkloadDeploymentNameLabel] + if depName == "" { + depName = wdNameFromInstanceName(inst.Name) + } + if dep, ok := deploymentMap[depName]; ok { + if labelCity != "" { + city = labelCity + } else { + city = dep.Spec.CityCode + } + if labelWLName != "" { + wlName = labelWLName + } else if dep.Spec.WorkloadRef.Name != "" { + wlName = dep.Spec.WorkloadRef.Name + } + } else { + // Deployment not found — use whatever labels we do have. + if labelCity != "" { + city = labelCity + } + if labelWLName != "" { + wlName = labelWLName + } + } + } + + // Client-side city filter. 
+ if opts.city != "" && city != opts.city { + continue + } + + extIP := "" + intIP := "" + if len(inst.Status.NetworkInterfaces) > 0 { + ni := inst.Status.NetworkInterfaces[0] + if ni.Assignments.ExternalIP != nil { + extIP = *ni.Assignments.ExternalIP + } + if ni.Assignments.NetworkIP != nil { + intIP = *ni.Assignments.NetworkIP + } + } + + runtimeKind := "vm" + if inst.Spec.Runtime.Sandbox != nil { + runtimeKind = "sandbox" + } + + rows = append(rows, instanceRow{ + name: inst.Name, + workload: wlName, + city: city, + externalIP: extIP, + internalIP: intIP, + runtimeKind: runtimeKind, + instType: inst.Spec.Runtime.Resources.InstanceType, + age: util.RelativeAge(inst.CreationTimestamp), + status: util.InstanceStatus(inst.Status.Conditions), + }) + } + + // Sort: workload ASC, city ASC, name ASC. + sort.Slice(rows, func(i, j int) bool { + if rows[i].workload != rows[j].workload { + return rows[i].workload < rows[j].workload + } + if rows[i].city != rows[j].city { + return rows[i].city < rows[j].city + } + return rows[i].name < rows[j].name + }) + + if len(rows) == 0 { + fmt.Fprintf(cmd.OutOrStdout(), "No instances found in project %s.\n", project) + return nil + } + + out := cmd.OutOrStdout() + wide := util.OutputFormat(outputFlag) == util.OutputWide + tw := util.NewTabWriter(out) + if !noHeaders { + if wide { + _, _ = fmt.Fprintf(tw, "NAME\tWORKLOAD\tCITY\tEXTERNAL IP\tINTERNAL IP\tTYPE\tAGE\tSTATUS\tINSTANCE TYPE\n") + } else { + _, _ = fmt.Fprintf(tw, "NAME\tWORKLOAD\tCITY\tEXTERNAL IP\tINTERNAL IP\tTYPE\tAGE\tSTATUS\n") + } + } + for _, r := range rows { + if wide { + _, _ = fmt.Fprintf(tw, "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", + r.name, r.workload, r.city, r.externalIP, r.internalIP, r.runtimeKind, r.age, r.status, r.instType) + } else { + _, _ = fmt.Fprintf(tw, "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", + r.name, r.workload, r.city, r.externalIP, r.internalIP, r.runtimeKind, r.age, r.status) + } + } + _ = tw.Flush() + + var available, pending, failed int + for _, 
r := range rows { + switch { + case r.status == "Available": + available++ + case strings.HasPrefix(r.status, "Failed"): + failed++ + default: + pending++ + } + } + _, _ = fmt.Fprintf(out, "\n%d instances — %d Available, %d Pending, %d Failed\n", len(rows), available, pending, failed) + + return nil +} + +func describeCommand() *cobra.Command { + return &cobra.Command{ + Use: "describe ", + Short: "Show full details for a single instance", + Long: `Display runtime configuration, network status, and current conditions for an +instance, including plain-English explanations of any failure states.`, + Args: cobra.ExactArgs(1), + RunE: func(cmd *cobra.Command, args []string) error { + return runDescribe(cmd, args) + }, + ValidArgsFunction: util.CompleteInstanceNames, + } +} + +func runDescribe(cmd *cobra.Command, args []string) error { + ctx := context.Background() + project := util.ProjectFromCmd(cmd) + + c, err := util.NewClient(project) + if err != nil { + return err + } + + instanceName := args[0] + + var inst computev1alpha.Instance + if err := c.Get(ctx, types.NamespacedName{Namespace: util.ResourceNamespace, Name: instanceName}, &inst); err != nil { + if k8serrors.IsNotFound(err) { + return fmt.Errorf("instance %q not found in project %s", instanceName, project) + } + return fmt.Errorf("getting instance: %w", err) + } + + // Resolve CITY, WORKLOAD, and PLACEMENT. Prefer self-describing labels + // stamped at creation time (no join needed). Fall back to a + // WorkloadDeployment Get when any of the labels are absent, so that older + // instances that predate the stamp still resolve correctly. + workloadName := "orphaned" + city := "unknown" + placementName := "" + + labelCity := inst.Labels[computev1alpha.CityCodeLabel] + labelWLName := inst.Labels[computev1alpha.WorkloadNameLabel] + labelPlacement := inst.Labels[computev1alpha.PlacementNameLabel] + + if labelCity != "" && labelWLName != "" && labelPlacement != "" { + // All three labels present: no join needed. 
+ city = labelCity + workloadName = labelWLName + placementName = labelPlacement + } else { + // At least one label absent — fall back to WorkloadDeployment Get. + // Prefer the WorkloadDeploymentNameLabel; fall back to deriving the WD + // name from the Instance name for existing instances that lack the label. + depName := inst.Labels[computev1alpha.WorkloadDeploymentNameLabel] + if depName == "" { + depName = wdNameFromInstanceName(inst.Name) + } + if depName != "" { + var dep computev1alpha.WorkloadDeployment + if err := c.Get(ctx, types.NamespacedName{Namespace: util.ResourceNamespace, Name: depName}, &dep); err == nil { + if labelCity != "" { + city = labelCity + } else { + city = dep.Spec.CityCode + } + if labelPlacement != "" { + placementName = labelPlacement + } else { + placementName = dep.Spec.PlacementName + } + if labelWLName != "" { + workloadName = labelWLName + } else { + workloadName = dep.Spec.WorkloadRef.Name + } + } else { + // WD Get failed — use whatever labels we do have. + if labelCity != "" { + city = labelCity + } + if labelWLName != "" { + workloadName = labelWLName + } + if labelPlacement != "" { + placementName = labelPlacement + } + } + } + } + + status, detail := util.InstanceStatusDetail(inst.Status.Conditions) + + out := cmd.OutOrStdout() + + // Key-value header block. + fmt.Fprintf(out, "%-14s %s\n", "Instance", instanceName) + fmt.Fprintf(out, "%-14s %s\n", "Workload", workloadName) + if placementName != "" { + fmt.Fprintf(out, "%-14s %s\n", "Placement", placementName) + } + fmt.Fprintf(out, "%-14s %s\n", "City", city) + fmt.Fprintf(out, "%-14s %s\n", "Age", util.RelativeAgeVerbose(inst.CreationTimestamp)) + fmt.Fprintf(out, "%-14s %s\n", "Status", status) + if detail != "" { + fmt.Fprintf(out, "%-14s %s\n", "", detail) + } + fmt.Fprintf(out, "\n") + + // Runtime section. 
+ fmt.Fprintf(out, "Runtime\n") + if inst.Spec.Runtime.Sandbox != nil { + sb := inst.Spec.Runtime.Sandbox + if len(sb.Containers) > 0 { + ctr := sb.Containers[0] + fmt.Fprintf(out, " %-12s %s\n", "Image:", ctr.Image) + + if len(ctr.Env) > 0 { + var envStrs []string + for _, e := range ctr.Env { + envStrs = append(envStrs, formatEnvVar(e)) + } + fmt.Fprintf(out, " %-12s %s\n", "Env:", strings.Join(envStrs, ", ")) + } + + if len(ctr.Ports) > 0 { + var portStrs []string + for _, p := range ctr.Ports { + proto := "TCP" + if p.Protocol != nil { + proto = string(*p.Protocol) + } + portStrs = append(portStrs, fmt.Sprintf("%d/%s", p.Port, proto)) + } + fmt.Fprintf(out, " %-12s %s\n", "Ports:", strings.Join(portStrs, ", ")) + } + } + fmt.Fprintf(out, " %-12s %s\n", "Type:", inst.Spec.Runtime.Resources.InstanceType) + } else { + fmt.Fprintf(out, " %-12s %s\n", "Type:", "virtual-machine") + fmt.Fprintf(out, " %-12s %s\n", "Instance type:", inst.Spec.Runtime.Resources.InstanceType) + } + fmt.Fprintf(out, "\n") + + // Network block. + fmt.Fprintf(out, "Network\n") + networkLine := networkSummary(inst.Status.NetworkInterfaces) + fmt.Fprintf(out, " %s\n", networkLine) + fmt.Fprintf(out, "\n") + + // Next steps if not available and quota exceeded. + quotaCond := util.FindCondition(inst.Status.Conditions, computev1alpha.InstanceQuotaGranted) + if status != "Available" && quotaCond != nil && quotaCond.Reason == computev1alpha.InstanceQuotaGrantedReasonQuotaExceeded { + fmt.Fprintf(out, "Next steps\n") + fmt.Fprintf(out, " datumctl compute scale %s --min=2\n", workloadName) + fmt.Fprintf(out, " datumctl compute quota\n") + } + + return nil +} + +// networkSummary returns a human-readable network status line. 
+func networkSummary(ifaces []computev1alpha.InstanceNetworkInterfaceStatus) string {
+	// No interface status reported yet.
+	if len(ifaces) == 0 {
+		return "Waiting for addresses (not yet scheduled)"
+	}
+	// Only the first interface is inspected; any further entries are ignored.
+	ni := ifaces[0]
+	// An interface with neither address assigned is reported the same way as
+	// having no interface at all.
+	if ni.Assignments.ExternalIP == nil && ni.Assignments.NetworkIP == nil {
+		return "Waiting for addresses (not yet scheduled)"
+	}
+	// At least one address is present; the missing one (if any) is shown as
+	// "not assigned".
+	extIP := "not assigned"
+	if ni.Assignments.ExternalIP != nil {
+		extIP = *ni.Assignments.ExternalIP
+	}
+	intIP := "not assigned"
+	if ni.Assignments.NetworkIP != nil {
+		intIP = *ni.Assignments.NetworkIP
+	}
+	return fmt.Sprintf("External: %s Internal: %s", extIP, intIP)
+}
+
+// wdNameFromInstanceName derives the WorkloadDeployment name from an Instance
+// name by stripping the trailing "-<ordinal>" suffix. Instance names follow the
+// convention "<workload-deployment-name>-<ordinal>" (e.g. "my-api-default-dfw-0" → "my-api-default-dfw").
+// This is used as a fallback when WorkloadDeploymentNameLabel is absent on older
+// instances that predate that label.
+//
+// If the name has no trailing numeric segment (not a standard instance name),
+// the original name is returned unchanged so callers can handle it gracefully.
+// That covers names without any "-", names whose last segment contains a
+// non-digit, and names that end in "-".
+func wdNameFromInstanceName(instanceName string) string {
+	idx := strings.LastIndex(instanceName, "-")
+	if idx < 0 {
+		return instanceName
+	}
+	suffix := instanceName[idx+1:]
+	// The suffix must be entirely numeric digits to qualify as an ordinal.
+	// NOTE(review): unicode.IsDigit accepts any Unicode decimal digit, not only
+	// ASCII 0-9 — confirm that is acceptable for instance names.
+	for _, r := range suffix {
+		if !unicode.IsDigit(r) {
+			return instanceName
+		}
+	}
+	// A name ending in "-" has an empty suffix: the loop above never runs, so
+	// it is rejected here instead.
+	if suffix == "" {
+		return instanceName
+	}
+	return instanceName[:idx]
+}
+
+// formatEnvVar renders a single EnvVar for display.
+func formatEnvVar(e corev1.EnvVar) string {
+	// Returns "NAME=VALUE" for literal variables. Variables populated from a
+	// reference are rendered as "NAME (from <source>)" so that neither a secret
+	// value nor a misleading empty "NAME=" is ever printed.
+	if src := e.ValueFrom; src != nil {
+		switch {
+		case src.SecretKeyRef != nil:
+			return e.Name + " (from secret)"
+		case src.ConfigMapKeyRef != nil:
+			return e.Name + " (from configmap)"
+		case src.FieldRef != nil:
+			return e.Name + " (from field)"
+		case src.ResourceFieldRef != nil:
+			return e.Name + " (from resource field)"
+		case e.Value == "":
+			// ValueFrom is set but names a source this function does not
+			// recognise, and there is no literal value to show.
+			return e.Name + " (from reference)"
+		}
+	}
+	return e.Name + "=" + e.Value
+}
diff --git a/internal/cmd/compute/quota/quota.go b/internal/cmd/compute/quota/quota.go
new file mode 100644
index 00000000..7091ec27
--- /dev/null
+++ b/internal/cmd/compute/quota/quota.go
@@ -0,0 +1,126 @@
+package quota
+
+import (
+	"context"
+	"fmt"
+	"strings"
+
+	"github.com/spf13/cobra"
+
+	"go.datum.net/compute/internal/cmd/compute/util"
+)
+
+const resourceTypePrefix = "compute.datumapis.com"
+
+// orderedTypes controls display order in the quota table.
+var orderedTypes = []string{
+	"compute.datumapis.com/workloads",
+	"compute.datumapis.com/instances",
+	"compute.datumapis.com/vcpus",
+	"compute.datumapis.com/memory",
+}
+
+// computeMeta provides display overrides for compute resource types.
+// The live ResourceRegistrations use displayUnit "1" so we supply our own.
+var computeMeta = map[string]util.QuotaMeta{ + "compute.datumapis.com/workloads": {DisplayName: "Workloads", Unit: "workloads", Divisor: 1}, + "compute.datumapis.com/instances": {DisplayName: "Instances", Unit: "instances", Divisor: 1}, + "compute.datumapis.com/vcpus": {DisplayName: "vCPUs", Unit: "vCPUs", Divisor: 1000}, + "compute.datumapis.com/memory": {DisplayName: "Memory", Unit: "MiB", Divisor: 1}, +} + +func Command() *cobra.Command { + var constrained bool + + cmd := &cobra.Command{ + Use: "quota", + Short: "Show compute quota for the current project", + RunE: func(cmd *cobra.Command, args []string) error { + return runQuota(cmd, constrained) + }, + } + + cmd.Flags().BoolVar(&constrained, "constrained", false, "Show only resource types that are at their limit") + cmd.Flags().StringP("output", "o", "table", "Output format: table, json, yaml") + + _ = cmd.RegisterFlagCompletionFunc("output", util.CompleteOutputFormats("table", "json", "yaml")) + + return cmd +} + +const barWidth = 20 + +func quotaBar(used, limit int64) string { + if limit <= 0 { + return "[" + strings.Repeat("-", barWidth) + "] N/A" + } + pct := float64(used) / float64(limit) + if pct > 1 { + pct = 1 + } + filled := int(pct * barWidth) + bar := strings.Repeat("#", filled) + strings.Repeat("-", barWidth-filled) + return fmt.Sprintf("[%s] %3.0f%%", bar, pct*100) +} + +func runQuota(cmd *cobra.Command, constrained bool) error { + project := util.ProjectFromCmd(cmd) + outputFlag, _ := cmd.Flags().GetString("output") + + projectClient, err := util.NewClient(project) + if err != nil { + return err + } + + platformClient, err := util.NewPlatformClient() + if err != nil { + return err + } + + ctx := context.Background() + + rows, err := util.ListServiceQuota(ctx, projectClient, platformClient, resourceTypePrefix, computeMeta, orderedTypes) + if err != nil { + return fmt.Errorf("listing quota: %w", err) + } + + if constrained { + filtered := rows[:0] + for _, r := range rows { + if r.Available == 0 { 
+ filtered = append(filtered, r) + } + } + rows = filtered + } + + out := cmd.OutOrStdout() + + switch util.OutputFormat(outputFlag) { + case util.OutputJSON: + return util.PrintJSON(out, rows) + case util.OutputYAML: + return util.PrintYAML(out, rows) + } + + if len(rows) == 0 { + if constrained { + _, _ = fmt.Fprintln(out, "No resource types are at their limit.") + } else { + _, _ = fmt.Fprintln(out, "No quota configured for this project.") + } + return nil + } + + _, _ = fmt.Fprintf(out, "Quota for project %s\n\n", project) + + tw := util.NewTabWriter(out) + _, _ = fmt.Fprintf(tw, "RESOURCE\tUNIT\tLIMIT\tUSED\tAVAILABLE\tUSAGE\n") + for _, r := range rows { + _, _ = fmt.Fprintf(tw, "%s\t%s\t%d\t%d\t%d\t%s\n", + r.DisplayName, r.Unit, r.Limit, r.Used, r.Available, quotaBar(r.Used, r.Limit)) + } + _ = tw.Flush() + + return nil +} diff --git a/internal/cmd/compute/restart/restart.go b/internal/cmd/compute/restart/restart.go new file mode 100644 index 00000000..9fafd618 --- /dev/null +++ b/internal/cmd/compute/restart/restart.go @@ -0,0 +1,115 @@ +package restart + +import ( + "context" + "fmt" + "time" + + "github.com/spf13/cobra" + k8serrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" + + computev1alpha "go.datum.net/compute/api/v1alpha" + "go.datum.net/compute/internal/cmd/compute/util" +) + +func Command() *cobra.Command { + var city string + + cmd := &cobra.Command{ + Use: "restart ", + Short: "Trigger a rolling restart of a workload", + Args: cobra.ExactArgs(1), + RunE: func(cmd *cobra.Command, args []string) error { + return runRestart(cmd, args, city) + }, + ValidArgsFunction: util.CompleteWorkloadNames, + } + + cmd.Flags().StringVar(&city, "city", "", "Restart only instances in a specific city") + + return cmd +} + +func runRestart(cmd *cobra.Command, args []string, city string) error { + project := util.ProjectFromCmd(cmd) + + c, err := 
util.NewClient(project) + if err != nil { + return err + } + + ctx := context.Background() + workloadName := args[0] + + var workload computev1alpha.Workload + if err := c.Get(ctx, types.NamespacedName{Namespace: util.ResourceNamespace, Name: workloadName}, &workload); err != nil { + if k8serrors.IsNotFound(err) { + return fmt.Errorf("workload %q not found in project %s", workloadName, project) + } + return fmt.Errorf("getting workload: %w", err) + } + + restartedAt := time.Now().UTC().Format(time.RFC3339) + out := cmd.OutOrStdout() + + if city == "" { + // Restart all placements by annotating the workload template. + if workload.Spec.Template.Annotations == nil { + workload.Spec.Template.Annotations = make(map[string]string) + } + workload.Spec.Template.Annotations[computev1alpha.RestartedAtAnnotation] = restartedAt + + if err := c.Update(ctx, &workload); err != nil { + return fmt.Errorf("updating workload: %w", err) + } + + fmt.Fprintf(out, + "Restarting workload %q — rolling restart initiated.\nRun 'datumctl compute rollout %s' to watch progress.\n", + workloadName, workloadName, + ) + return nil + } + + // Restart only deployments in the given city. 
+ selector := labels.SelectorFromSet(labels.Set{ + computev1alpha.WorkloadUIDLabel: string(workload.UID), + }) + var deployList computev1alpha.WorkloadDeploymentList + if err := c.List(ctx, &deployList, + client.InNamespace(util.ResourceNamespace), + client.MatchingLabelsSelector{Selector: selector}, + ); err != nil { + return fmt.Errorf("listing deployments: %w", err) + } + + var matched []computev1alpha.WorkloadDeployment + for _, d := range deployList.Items { + if d.Spec.CityCode == city { + matched = append(matched, d) + } + } + + if len(matched) == 0 { + return fmt.Errorf("no deployment found for workload %q in city %q", workloadName, city) + } + + for i := range matched { + if matched[i].Spec.Template.Annotations == nil { + matched[i].Spec.Template.Annotations = make(map[string]string) + } + matched[i].Spec.Template.Annotations[computev1alpha.RestartedAtAnnotation] = restartedAt + + if err := c.Update(ctx, &matched[i]); err != nil { + return fmt.Errorf("updating deployment in %s: %w", city, err) + } + } + + fmt.Fprintf(out, + "Restarting workload %q in %s — rolling restart initiated.\nRun 'datumctl compute rollout %s' to watch progress.\n", + workloadName, city, workloadName, + ) + return nil +} diff --git a/internal/cmd/compute/rollout/rollout.go b/internal/cmd/compute/rollout/rollout.go new file mode 100644 index 00000000..0a3800aa --- /dev/null +++ b/internal/cmd/compute/rollout/rollout.go @@ -0,0 +1,62 @@ +package rollout + +import ( + "context" + "fmt" + "os" + "os/signal" + + "github.com/spf13/cobra" + k8serrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/types" + + computev1alpha "go.datum.net/compute/api/v1alpha" + "go.datum.net/compute/internal/cmd/compute/util" + "go.datum.net/compute/internal/cmd/compute/watch" +) + +func Command() *cobra.Command { + cmd := &cobra.Command{ + Use: "rollout ", + Short: "Watch live rollout progress for a workload", + Long: `Watch the live progress of a rollout across all placements. 
+ +Pressing Ctrl-C detaches from the watch without canceling the rollout.`, + Args: cobra.ExactArgs(1), + Example: ` # Watch live rollout progress + datumctl compute rollout api`, + RunE: func(cmd *cobra.Command, args []string) error { + return runWatch(cmd, args) + }, + ValidArgsFunction: util.CompleteWorkloadNames, + } + + return cmd +} + +func runWatch(cmd *cobra.Command, args []string) error { + project := util.ProjectFromCmd(cmd) + + c, err := util.NewClient(project) + if err != nil { + return err + } + + ctx := context.Background() + workloadName := args[0] + + var workload computev1alpha.Workload + if err := c.Get(ctx, types.NamespacedName{Namespace: util.ResourceNamespace, Name: workloadName}, &workload); err != nil { + if k8serrors.IsNotFound(err) { + return fmt.Errorf("workload %q not found in project %s", workloadName, project) + } + return fmt.Errorf("getting workload: %w", err) + } + + out := cmd.OutOrStdout() + fmt.Fprintf(out, "Rolling workload %q\n", workloadName) + + watchCtx, cancel := signal.NotifyContext(cmd.Context(), os.Interrupt) + defer cancel() + return watch.Rollout(watchCtx, c, out, project, workload.UID) +} diff --git a/internal/cmd/compute/root.go b/internal/cmd/compute/root.go new file mode 100644 index 00000000..7dbc2ccc --- /dev/null +++ b/internal/cmd/compute/root.go @@ -0,0 +1,38 @@ +package compute + +import ( + "github.com/spf13/cobra" + "go.datum.net/datumctl/plugin" + + "go.datum.net/compute/internal/cmd/compute/deploy" + "go.datum.net/compute/internal/cmd/compute/destroy" + "go.datum.net/compute/internal/cmd/compute/instances" + "go.datum.net/compute/internal/cmd/compute/quota" + "go.datum.net/compute/internal/cmd/compute/restart" + "go.datum.net/compute/internal/cmd/compute/rollout" + "go.datum.net/compute/internal/cmd/compute/scale" + "go.datum.net/compute/internal/cmd/compute/util" + "go.datum.net/compute/internal/cmd/compute/workloads" +) + +func Command() *cobra.Command { + root := plugin.NewRootCmd("compute", "Deploy and 
manage containerized workloads on Datum Cloud") + root.SilenceUsage = true + + root.PersistentPreRunE = func(cmd *cobra.Command, args []string) error { + return util.EnsureComputeEntitlement(cmd.Context(), util.ProjectFromCmd(cmd), cmd.InOrStdin(), cmd.ErrOrStderr()) + } + + root.AddCommand( + deploy.Command(), + destroy.Command(), + instances.Command(), + quota.Command(), + restart.Command(), + rollout.Command(), + scale.Command(), + workloads.Command(), + ) + + return root +} diff --git a/internal/cmd/compute/scale/scale.go b/internal/cmd/compute/scale/scale.go new file mode 100644 index 00000000..1ce704ee --- /dev/null +++ b/internal/cmd/compute/scale/scale.go @@ -0,0 +1,77 @@ +package scale + +import ( + "context" + "fmt" + + "github.com/spf13/cobra" + k8serrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/types" + + computev1alpha "go.datum.net/compute/api/v1alpha" + "go.datum.net/compute/internal/cmd/compute/util" +) + +func Command() *cobra.Command { + var min int32 + + cmd := &cobra.Command{ + Use: "scale ", + Short: "Adjust the minimum replica count for a workload", + Args: cobra.ExactArgs(1), + Example: ` datumctl compute scale api --min=4`, + RunE: func(cmd *cobra.Command, args []string) error { + return runScale(cmd, args, min) + }, + ValidArgsFunction: util.CompleteWorkloadNames, + } + + cmd.Flags().Int32Var(&min, "min", 0, "Minimum number of instances per city") + _ = cmd.MarkFlagRequired("min") + + return cmd +} + +func runScale(cmd *cobra.Command, args []string, min int32) error { + if min <= 0 { + return fmt.Errorf("min replicas must be at least 1") + } + + project := util.ProjectFromCmd(cmd) + + c, err := util.NewClient(project) + if err != nil { + return err + } + + ctx := context.Background() + workloadName := args[0] + + var workload computev1alpha.Workload + if err := c.Get(ctx, types.NamespacedName{Namespace: util.ResourceNamespace, Name: workloadName}, &workload); err != nil { + if k8serrors.IsNotFound(err) { + return 
fmt.Errorf("workload %q not found in project %s", workloadName, project) + } + return fmt.Errorf("getting workload: %w", err) + } + + if len(workload.Spec.Placements) == 0 { + _, _ = fmt.Fprintln(cmd.OutOrStdout(), "workload has no placements; nothing to scale") + return nil + } + + for i := range workload.Spec.Placements { + workload.Spec.Placements[i].ScaleSettings.MinReplicas = min + } + + if err := c.Update(ctx, &workload); err != nil { + return fmt.Errorf("updating workload: %w", err) + } + + fmt.Fprintf(cmd.OutOrStdout(), + "Scaled workload %q — min replicas set to %d across %d placement(s).\nRun 'datumctl compute rollout %s' to watch progress.\n", + workloadName, min, len(workload.Spec.Placements), workloadName, + ) + + return nil +} diff --git a/internal/cmd/compute/util/client.go b/internal/cmd/compute/util/client.go new file mode 100644 index 00000000..feafc7ab --- /dev/null +++ b/internal/cmd/compute/util/client.go @@ -0,0 +1,97 @@ +package util + +import ( + "fmt" + + "github.com/spf13/cobra" + computev1alpha "go.datum.net/compute/api/v1alpha" + "go.datum.net/datumctl/plugin" + networkingv1alpha "go.datum.net/network-services-operator/api/v1alpha" + quotav1alpha1 "go.miloapis.com/milo/pkg/apis/quota/v1alpha1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/rest" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +const ( + resourceManagerGroup = "resourcemanager.miloapis.com" + resourceManagerVersion = "v1alpha1" + + // ResourceNamespace is the namespace used for all resource operations within + // a project's virtual control plane. The project slug routes to the right + // control plane; within it, everything lives in "default". + ResourceNamespace = "default" +) + +// ProjectControlPlaneURL returns the virtual control-plane URL for a project. 
+func ProjectControlPlaneURL(apiHost, projectID string) string {
+	// NOTE(review): apiHost and projectID are interpolated as-is, without URL
+	// escaping — presumably project IDs are URL-safe slugs; confirm.
+	return fmt.Sprintf("https://%s/apis/%s/%s/projects/%s/control-plane",
+		apiHost, resourceManagerGroup, resourceManagerVersion, projectID)
+}
+
+// NewClient builds a Kubernetes client targeting the project's virtual control plane.
+//
+// The client's scheme has the compute, networking and quota API types
+// registered. An error is returned when project is empty, when the plugin
+// context carries no API host, when no token can be obtained, or when scheme
+// registration or client construction fails.
+func NewClient(project string) (client.Client, error) {
+	if project == "" {
+		// Name the expected argument so the hint can be followed as written.
+		return nil, fmt.Errorf("no project set — pass --project or run 'datumctl config set project <project>'")
+	}
+
+	ctx := plugin.Context()
+	if ctx.APIHost == "" {
+		return nil, fmt.Errorf("DATUM_API_HOST is not set; is this plugin running via datumctl?")
+	}
+
+	token, err := plugin.Token()
+	if err != nil {
+		return nil, fmt.Errorf("getting credentials: %w", err)
+	}
+
+	scheme := runtime.NewScheme()
+	if err := computev1alpha.AddToScheme(scheme); err != nil {
+		return nil, fmt.Errorf("registering compute scheme: %w", err)
+	}
+	if err := networkingv1alpha.AddToScheme(scheme); err != nil {
+		return nil, fmt.Errorf("registering networking scheme: %w", err)
+	}
+	if err := quotav1alpha1.AddToScheme(scheme); err != nil {
+		return nil, fmt.Errorf("registering quota scheme: %w", err)
+	}
+
+	// Requests go to the project's control-plane URL with the token as a
+	// bearer credential.
+	cfg := &rest.Config{
+		Host:        ProjectControlPlaneURL(ctx.APIHost, project),
+		BearerToken: token,
+	}
+
+	return client.New(cfg, client.Options{Scheme: scheme})
+}
+
+// NewPlatformClient builds a Kubernetes client targeting the platform API server
+// (not a project-scoped virtual control plane).
+func NewPlatformClient() (client.Client, error) {
+	// The API host comes from the plugin context; without it there is nothing
+	// to connect to.
+	ctx := plugin.Context()
+	if ctx.APIHost == "" {
+		return nil, fmt.Errorf("DATUM_API_HOST is not set; is this plugin running via datumctl?")
+	}
+
+	token, err := plugin.Token()
+	if err != nil {
+		return nil, fmt.Errorf("getting credentials: %w", err)
+	}
+
+	// Only the quota API types are registered on this client's scheme.
+	scheme := runtime.NewScheme()
+	if err := quotav1alpha1.AddToScheme(scheme); err != nil {
+		return nil, fmt.Errorf("registering quota scheme: %w", err)
+	}
+
+	// Host is the bare API host: no project path is appended.
+	cfg := &rest.Config{
+		Host:        "https://" + ctx.APIHost,
+		BearerToken: token,
+	}
+
+	return client.New(cfg, client.Options{Scheme: scheme})
+}
+
+// ProjectFromCmd reads the --project persistent flag from the command's root.
+// The error from GetString is discarded, so the caller only ever sees the
+// returned string (presumably "" when the flag is not registered — confirm).
+func ProjectFromCmd(cmd *cobra.Command) string {
+	project, _ := cmd.Root().PersistentFlags().GetString("project")
+	return project
+}
diff --git a/internal/cmd/compute/util/completion.go b/internal/cmd/compute/util/completion.go
new file mode 100644
index 00000000..c6dcb3fb
--- /dev/null
+++ b/internal/cmd/compute/util/completion.go
@@ -0,0 +1,100 @@
+package util
+
+import (
+	"context"
+
+	"github.com/spf13/cobra"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+
+	computev1alpha "go.datum.net/compute/api/v1alpha"
+	"go.datum.net/datumctl/plugin"
+)
+
+// CompleteInstanceNames is a ValidArgsFunction that lists instance names from the API.
+func CompleteInstanceNames(cmd *cobra.Command, args []string, _ string) ([]string, cobra.ShellCompDirective) { + if len(args) > 0 { + return nil, cobra.ShellCompDirectiveNoFileComp + } + + project := ProjectFromCmd(cmd) + c, err := NewClient(project) + if err != nil { + return nil, cobra.ShellCompDirectiveNoFileComp + } + + var list computev1alpha.InstanceList + if err := c.List(context.Background(), &list, client.InNamespace(ResourceNamespace)); err != nil { + return nil, cobra.ShellCompDirectiveNoFileComp + } + + names := make([]string, len(list.Items)) + for i, inst := range list.Items { + names[i] = inst.Name + } + return names, cobra.ShellCompDirectiveNoFileComp +} + +// CompleteCityCodes is a ValidArgsFunction that returns unique city codes from +// all WorkloadDeployments in the project. +func CompleteCityCodes(cmd *cobra.Command, _ []string, _ string) ([]string, cobra.ShellCompDirective) { + project := ProjectFromCmd(cmd) + c, err := NewClient(project) + if err != nil { + return nil, cobra.ShellCompDirectiveNoFileComp + } + + var list computev1alpha.WorkloadDeploymentList + if err := c.List(context.Background(), &list, client.InNamespace(ResourceNamespace)); err != nil { + return nil, cobra.ShellCompDirectiveNoFileComp + } + + seen := make(map[string]bool) + var codes []string + for _, d := range list.Items { + if !seen[d.Spec.CityCode] { + seen[d.Spec.CityCode] = true + codes = append(codes, d.Spec.CityCode) + } + } + return codes, cobra.ShellCompDirectiveNoFileComp +} + +// CompleteOutputFormats returns a ValidArgsFunction that completes -o/--output +// to the given allowed values. +func CompleteOutputFormats(allowed ...string) func(*cobra.Command, []string, string) ([]string, cobra.ShellCompDirective) { + return func(_ *cobra.Command, _ []string, _ string) ([]string, cobra.ShellCompDirective) { + return allowed, cobra.ShellCompDirectiveNoFileComp + } +} + +// CompleteWorkloadNames is a ValidArgsFunction that lists workload names from +// the API. 
It suppresses file completion in all cases so the shell never falls +// back to filename completion when completing a workload-name argument. +func CompleteWorkloadNames(cmd *cobra.Command, args []string, _ string) ([]string, cobra.ShellCompDirective) { + if len(args) > 0 { + return nil, cobra.ShellCompDirectiveNoFileComp + } + + project := ProjectFromCmd(cmd) + c, err := NewClient(project) + if err != nil { + return nil, cobra.ShellCompDirectiveNoFileComp + } + + var list computev1alpha.WorkloadList + if err := c.List(context.Background(), &list, client.InNamespace(ResourceNamespace)); err != nil { + return nil, cobra.ShellCompDirectiveNoFileComp + } + + names := make([]string, len(list.Items)) + for i, w := range list.Items { + names[i] = w.Name + } + return names, cobra.ShellCompDirectiveNoFileComp +} + +// CompleteWorkloadNamesAndFlags lists workload names from the API and also +// surfaces the command's own flags as completions. Used by commands where flags +// are the primary input (e.g. deploy) so that plain offers flags without +// requiring the user to type "--" first. +var CompleteWorkloadNamesAndFlags = plugin.WithFlagCompletion(CompleteWorkloadNames) diff --git a/internal/cmd/compute/util/conditions.go b/internal/cmd/compute/util/conditions.go new file mode 100644 index 00000000..b1c896f8 --- /dev/null +++ b/internal/cmd/compute/util/conditions.go @@ -0,0 +1,168 @@ +package util + +import ( + "fmt" + + v1alpha "go.datum.net/compute/api/v1alpha" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// FindCondition returns the first condition with the given type, or nil. +func FindCondition(conditions []metav1.Condition, condType string) *metav1.Condition { + for i := range conditions { + if conditions[i].Type == condType { + return &conditions[i] + } + } + return nil +} + +// ReadinessBlock reads the named readiness condition and reports whether it is +// blocking (present and not True). 
When blocked, it returns the machine-readable +// reason and human-readable message from the condition. Callers must not branch +// on specific reason values — display whatever the server emits. +func ReadinessBlock(conditions []metav1.Condition, condType string) (reason, message string, blocked bool) { + c := FindCondition(conditions, condType) + if c == nil || c.Status == metav1.ConditionTrue { + return "", "", false + } + return c.Reason, c.Message, true +} + +// InstanceStatus returns a short user-facing status string for list views. +// It reports availability, not live runtime state — it never indicates +// whether a process is actively running at this instant. +// Priority order: +// +// Ready=True → "Available" +// QuotaGranted=False/QuotaExceeded → "Pending (quota exceeded)" +// QuotaGranted=False/ValidationFailed → "Pending (quota validation failed)" +// QuotaGranted=Unknown/PendingEvaluation → "Pending (quota evaluation)" +// Programmed=False/ImageUnavailable → "Failed (image unavailable)" +// Programmed=False/InstanceCrashing → "Failed (crashing)" +// Programmed=False/ConfigurationError → "Failed (configuration error)" +// Programmed≠True/PendingProgramming or ProgrammingInProgress → "Starting" +// Ready≠True/ → "Pending ()" from server-rolled-up blocking reason +// default → "Pending" +func InstanceStatus(conditions []metav1.Condition) string { + ready := FindCondition(conditions, v1alpha.InstanceReady) + if ready != nil && ready.Status == metav1.ConditionTrue { + return "Available" + } + + quota := FindCondition(conditions, v1alpha.InstanceQuotaGranted) + if quota != nil && quota.Status == metav1.ConditionFalse { + switch quota.Reason { + case v1alpha.InstanceQuotaGrantedReasonQuotaExceeded: + return "Pending (quota exceeded)" + case v1alpha.InstanceQuotaGrantedReasonValidationFailed: + return "Pending (quota validation failed)" + } + } + if quota != nil && quota.Status == metav1.ConditionUnknown { + if quota.Reason == 
v1alpha.InstanceQuotaGrantedReasonPendingEvaluation { + return "Pending (quota evaluation)" + } + } + + programmed := FindCondition(conditions, v1alpha.InstanceProgrammed) + if programmed != nil && programmed.Status != metav1.ConditionTrue { + switch programmed.Reason { + case v1alpha.InstanceProgrammedReasonImageUnavailable: + return "Failed (image unavailable)" + case v1alpha.InstanceProgrammedReasonInstanceCrashing: + return "Failed (crashing)" + case v1alpha.InstanceProgrammedReasonConfigurationError: + return "Failed (configuration error)" + case v1alpha.InstanceProgrammedReasonPendingProgramming, v1alpha.InstanceProgrammedReasonProgrammingInProgress: + return "Starting" + } + } + + // Use the server-rolled-up blocking reason from the Ready condition. This + // surfaces reasons like SourceNotFound or ReferencedDataNotReady that the + // sub-condition checks above don't cover, without requiring client-side + // knowledge of every reason value. + if reason, _, blocked := ReadinessBlock(conditions, v1alpha.InstanceReady); blocked && reason != "" { + return "Pending (" + reason + ")" + } + + return "Pending" +} + +// InstanceStatusDetail returns a status line and optional detail message for describe views. 
+//
+// Checks are applied in this order; the first match wins:
+//
+//	Ready=True                                                  → "Available", ""
+//	QuotaGranted=False/QuotaExceeded                            → "Not available — quota exceeded", condition.Message
+//	Programmed≠True/ImageUnavailable                            → "Not available — image unavailable", condition.Message
+//	Programmed≠True/InstanceCrashing                            → "Not available — instance crashing", condition.Message
+//	Programmed≠True/ConfigurationError                          → "Not available — configuration error", condition.Message
+//	Programmed≠True/PendingProgramming or ProgrammingInProgress → "Starting", ""
+//	Ready≠True/{reason}                                         → "Pending — {reason}", message (server-rolled-up blocking reason)
+//	default                                                     → "Pending", ""
+func InstanceStatusDetail(conditions []metav1.Condition) (status, detail string) {
+	ready := FindCondition(conditions, v1alpha.InstanceReady)
+	if ready != nil && ready.Status == metav1.ConditionTrue {
+		return "Available", ""
+	}
+
+	quota := FindCondition(conditions, v1alpha.InstanceQuotaGranted)
+	if quota != nil && quota.Status == metav1.ConditionFalse && quota.Reason == v1alpha.InstanceQuotaGrantedReasonQuotaExceeded {
+		return "Not available — quota exceeded", quota.Message
+	}
+
+	// Any Programmed status other than True (False or Unknown) is inspected by reason.
+	programmed := FindCondition(conditions, v1alpha.InstanceProgrammed)
+	if programmed != nil && programmed.Status != metav1.ConditionTrue {
+		switch programmed.Reason {
+		case v1alpha.InstanceProgrammedReasonImageUnavailable:
+			return "Not available — image unavailable", programmed.Message
+		case v1alpha.InstanceProgrammedReasonInstanceCrashing:
+			return "Not available — instance crashing", programmed.Message
+		case v1alpha.InstanceProgrammedReasonConfigurationError:
+			return "Not available — configuration error", programmed.Message
+		case v1alpha.InstanceProgrammedReasonPendingProgramming, v1alpha.InstanceProgrammedReasonProgrammingInProgress:
+			return "Starting", ""
+		}
+	}
+
+	// Fall back to the server-rolled-up blocking reason on the Ready condition.
+	// This surfaces reasons like SourceNotFound and ReferencedDataNotReady
+	// without requiring client-side special-casing of every reason value.
+	if reason, msg, blocked := ReadinessBlock(conditions, v1alpha.InstanceReady); blocked && reason != "" {
+		return "Pending — " + reason, msg
+	}
+
+	return "Pending", ""
+}
+
+// WorkloadHealth derives a one-line health summary from the workload's
+// Available condition and its ready/desired replica counts.
+//
+//	Available=True, ready>=desired → "Available — all placements at desired replicas"
+//	Available=True, ready<desired  → "Degraded — N instance(s) below desired count"
+//	Available=False/{reason}       → "Unavailable — {reason}" (server-rolled-up blocking reason)
+//	Available=False (no reason)    → "Unavailable — no healthy instances"
+//	Unknown/missing                → "Unknown"
+func WorkloadHealth(conditions []metav1.Condition, ready, desired int32) string {
+	avail := FindCondition(conditions, v1alpha.WorkloadAvailable)
+	if avail == nil || avail.Status == metav1.ConditionUnknown {
+		return "Unknown"
+	}
+	if avail.Status == metav1.ConditionFalse {
+		if avail.Reason != "" {
+			return "Unavailable — " + avail.Reason
+		}
+		return "Unavailable — no healthy instances"
+	}
+
+	// Any other status is handled as Available=True.
+	if ready >= desired {
+		return "Available — all placements at desired replicas"
+	}
+
+	// Use the singular noun when exactly one replica is missing.
+	missing := desired - ready
+	noun := "instances"
+	if missing == 1 {
+		noun = "instance"
+	}
+	return fmt.Sprintf("Degraded — %d %s below desired count", missing, noun)
+}
+
+// IsAvailable returns true if the instance's Ready condition status is True.
+func IsAvailable(conditions []metav1.Condition) bool {
+	c := FindCondition(conditions, v1alpha.InstanceReady)
+	return c != nil && c.Status == metav1.ConditionTrue
+}
diff --git a/internal/cmd/compute/util/conditions_test.go b/internal/cmd/compute/util/conditions_test.go
new file mode 100644
index 00000000..0d22b850
--- /dev/null
+++ b/internal/cmd/compute/util/conditions_test.go
@@ -0,0 +1,386 @@
+package util
+
+import (
+	"testing"
+
+	v1alpha "go.datum.net/compute/api/v1alpha"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+)
+
+// makeCondition builds a metav1.Condition from plain strings so the table
+// tests below can declare conditions on a single line.
+func makeCondition(condType, status, reason, message string) metav1.Condition {
+	return metav1.Condition{
+		Type:    condType,
+		Status:  metav1.ConditionStatus(status),
+		Reason:  reason,
+		Message: message,
+	}
+}
+
+// TestReadinessBlock covers the generic helper that is the heart of the
+// status-blocking-reason contract.
+func TestReadinessBlock(t *testing.T) {
+	tests := []struct {
+		name        string
+		conditions  []metav1.Condition
+		condType    string
+		wantReason  string
+		wantMessage string
+		wantBlocked bool
+	}{
+		{
+			name:        "condition absent — not blocked",
+			conditions:  nil,
+			condType:    v1alpha.InstanceReady,
+			wantBlocked: false,
+		},
+		{
+			name: "condition True — not blocked",
+			conditions: []metav1.Condition{
+				makeCondition(v1alpha.InstanceReady, "True", "Available", ""),
+			},
+			condType:    v1alpha.InstanceReady,
+			wantBlocked: false,
+		},
+		{
+			name: "condition False with reason and message — blocked",
+			conditions: []metav1.Condition{
+				makeCondition(v1alpha.InstanceReady, "False", "SourceNotFound", `ConfigMap "app-config" not found in namespace "default"`),
+			},
+			condType:    v1alpha.InstanceReady,
+			wantReason:  "SourceNotFound",
+			wantMessage: `ConfigMap "app-config" not found in namespace "default"`,
+			wantBlocked: true,
+		},
+		{
+			name: "condition False with reason only — blocked",
+			conditions: []metav1.Condition{
+				makeCondition(v1alpha.InstanceReady, "False", "ReferencedDataNotReady", ""),
+			},
+			condType:    v1alpha.InstanceReady,
+			wantReason:  "ReferencedDataNotReady",
+			wantMessage: "",
+			wantBlocked: true,
+		},
+		{
+			name: "condition Unknown — blocked",
+			conditions: []metav1.Condition{
+				makeCondition(v1alpha.InstanceReady, "Unknown", "PendingQuota", ""),
+			},
+			condType:    v1alpha.InstanceReady,
+			wantReason:  "PendingQuota",
+			wantBlocked: true,
+		},
+		{
+			name: "wrong condition type present — not blocked",
+			conditions: []metav1.Condition{
+				makeCondition(v1alpha.InstanceProgrammed, "False", "PendingProgramming", ""),
+			},
+			condType:    v1alpha.InstanceReady,
+			wantBlocked: false,
+		},
+		{
+			name: "WorkloadDeploymentAvailable False — blocked",
+			conditions: []metav1.Condition{
+				makeCondition(v1alpha.WorkloadDeploymentAvailable, "False", "NetworkProvisioning", "Waiting for network assignment"),
+			},
+			condType:    v1alpha.WorkloadDeploymentAvailable,
+			wantReason:  "NetworkProvisioning",
+			wantMessage: "Waiting for network assignment",
+			wantBlocked: true,
+		},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			reason, msg, blocked := ReadinessBlock(tc.conditions, tc.condType)
+			if blocked != tc.wantBlocked {
+				t.Errorf("ReadinessBlock() blocked = %v, want %v", blocked, tc.wantBlocked)
+			}
+			if reason != tc.wantReason {
+				t.Errorf("ReadinessBlock() reason = %q, want %q", reason, tc.wantReason)
+			}
+			if msg != tc.wantMessage {
+				t.Errorf("ReadinessBlock() message = %q, want %q", msg, tc.wantMessage)
+			}
+		})
+	}
+}
+
+// TestInstanceStatusDetail_BlockingReason verifies that the describe view
+// surfaces reason+message from the Ready condition when no specific sub-condition
+// check matches — the server-rolled-up blocking reason path.
+func TestInstanceStatusDetail_BlockingReason(t *testing.T) {
+	tests := []struct {
+		name       string
+		conditions []metav1.Condition
+		wantStatus string
+		wantDetail string
+	}{
+		{
+			name:       "no conditions — Pending, no detail",
+			conditions: nil,
+			wantStatus: "Pending",
+			wantDetail: "",
+		},
+		{
+			name: "Ready True — Available",
+			conditions: []metav1.Condition{
+				makeCondition(v1alpha.InstanceReady, "True", "Available", ""),
+			},
+			wantStatus: "Available",
+			wantDetail: "",
+		},
+		{
+			name: "Ready False / SourceNotFound — Pending with reason and message",
+			conditions: []metav1.Condition{
+				makeCondition(v1alpha.InstanceReady, "False", "SourceNotFound", `ConfigMap "app-config" not found in namespace "default"`),
+			},
+			wantStatus: "Pending — SourceNotFound",
+			wantDetail: `ConfigMap "app-config" not found in namespace "default"`,
+		},
+		{
+			name: "Ready False / ReferencedDataNotReady — Pending with reason, no message",
+			conditions: []metav1.Condition{
+				makeCondition(v1alpha.InstanceReady, "False", "ReferencedDataNotReady", ""),
+			},
+			wantStatus: "Pending — ReferencedDataNotReady",
+			wantDetail: "",
+		},
+		{
+			name: "quota exceeded still uses specific path",
+			conditions: []metav1.Condition{
+				makeCondition(v1alpha.InstanceQuotaGranted, "False", v1alpha.InstanceQuotaGrantedReasonQuotaExceeded, "quota limit reached"),
+			},
+			wantStatus: "Not available — quota exceeded",
+			wantDetail: "quota limit reached",
+		},
+		{
+			name: "Programmed Unknown/PendingProgramming — Starting",
+			conditions: []metav1.Condition{
+				makeCondition(v1alpha.InstanceProgrammed, "Unknown", v1alpha.InstanceProgrammedReasonPendingProgramming, ""),
+			},
+			wantStatus: "Starting",
+			wantDetail: "",
+		},
+		{
+			name: "Programmed Unknown/ProgrammingInProgress — Starting",
+			conditions: []metav1.Condition{
+				makeCondition(v1alpha.InstanceProgrammed, "Unknown", v1alpha.InstanceProgrammedReasonProgrammingInProgress, ""),
+			},
+			wantStatus: "Starting",
+			wantDetail: "",
+		},
+		{
+			name: "Programmed False/ProgrammingInProgress — Starting",
+			conditions: []metav1.Condition{
+				makeCondition(v1alpha.InstanceProgrammed, "False", v1alpha.InstanceProgrammedReasonProgrammingInProgress, ""),
+			},
+			wantStatus: "Starting",
+			wantDetail: "",
+		},
+		{
+			name: "Programmed False/ImageUnavailable — not available image unavailable",
+			conditions: []metav1.Condition{
+				makeCondition(v1alpha.InstanceProgrammed, "False", v1alpha.InstanceProgrammedReasonImageUnavailable, "image pull failed: not found"),
+			},
+			wantStatus: "Not available — image unavailable",
+			wantDetail: "image pull failed: not found",
+		},
+		{
+			name: "Programmed Unknown/ImageUnavailable — not available image unavailable",
+			conditions: []metav1.Condition{
+				makeCondition(v1alpha.InstanceProgrammed, "Unknown", v1alpha.InstanceProgrammedReasonImageUnavailable, "image pull failed: not found"),
+			},
+			wantStatus: "Not available — image unavailable",
+			wantDetail: "image pull failed: not found",
+		},
+		{
+			name: "Programmed False/InstanceCrashing — not available instance crashing",
+			conditions: []metav1.Condition{
+				makeCondition(v1alpha.InstanceProgrammed, "False", v1alpha.InstanceProgrammedReasonInstanceCrashing, "exit code 1"),
+			},
+			wantStatus: "Not available — instance crashing",
+			wantDetail: "exit code 1",
+		},
+		{
+			name: "Programmed Unknown/InstanceCrashing — not available instance crashing",
+			conditions: []metav1.Condition{
+				makeCondition(v1alpha.InstanceProgrammed, "Unknown", v1alpha.InstanceProgrammedReasonInstanceCrashing, "exit code 1"),
+			},
+			wantStatus: "Not available — instance crashing",
+			wantDetail: "exit code 1",
+		},
+		{
+			name: "Programmed False/ConfigurationError — not available configuration error",
+			conditions: []metav1.Condition{
+				makeCondition(v1alpha.InstanceProgrammed, "False", v1alpha.InstanceProgrammedReasonConfigurationError, "invalid env var name"),
+			},
+			wantStatus: "Not available — configuration error",
+			wantDetail: "invalid env var name",
+		},
+		{
+			name: "Programmed Unknown/ConfigurationError — not available configuration error",
+			conditions: []metav1.Condition{
+				makeCondition(v1alpha.InstanceProgrammed, "Unknown", v1alpha.InstanceProgrammedReasonConfigurationError, "invalid env var name"),
+			},
+			wantStatus: "Not available — configuration error",
+			wantDetail: "invalid env var name",
+		},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			status, detail := InstanceStatusDetail(tc.conditions)
+			if status != tc.wantStatus {
+				t.Errorf("InstanceStatusDetail() status = %q, want %q", status, tc.wantStatus)
+			}
+			if detail != tc.wantDetail {
+				t.Errorf("InstanceStatusDetail() detail = %q, want %q", detail, tc.wantDetail)
+			}
+		})
+	}
+}
+
+// TestInstanceStatus_BlockingReason verifies that the list-view short status
+// surfaces the server-rolled-up reason from Ready when no sub-condition matches.
+func TestInstanceStatus_BlockingReason(t *testing.T) {
+	tests := []struct {
+		name       string
+		conditions []metav1.Condition
+		wantStatus string
+	}{
+		{
+			name:       "no conditions — Pending",
+			conditions: nil,
+			wantStatus: "Pending",
+		},
+		{
+			name: "Ready False / SourceNotFound — Pending (SourceNotFound)",
+			conditions: []metav1.Condition{
+				makeCondition(v1alpha.InstanceReady, "False", "SourceNotFound", `ConfigMap "app-config" not found`),
+			},
+			wantStatus: "Pending (SourceNotFound)",
+		},
+		{
+			name: "Ready False / ReferencedDataNotReady — Pending (ReferencedDataNotReady)",
+			conditions: []metav1.Condition{
+				makeCondition(v1alpha.InstanceReady, "False", "ReferencedDataNotReady", ""),
+			},
+			wantStatus: "Pending (ReferencedDataNotReady)",
+		},
+		{
+			name: "Programmed Unknown/PendingProgramming — Starting",
+			conditions: []metav1.Condition{
+				makeCondition(v1alpha.InstanceProgrammed, "Unknown", v1alpha.InstanceProgrammedReasonPendingProgramming, ""),
+			},
+			wantStatus: "Starting",
+		},
+		{
+			name: "Programmed Unknown/ProgrammingInProgress — Starting",
+			conditions: []metav1.Condition{
+				makeCondition(v1alpha.InstanceProgrammed, "Unknown", v1alpha.InstanceProgrammedReasonProgrammingInProgress, ""),
+			},
+			wantStatus: "Starting",
+		},
+		{
+			name: "Programmed False/ImageUnavailable — Failed (image unavailable)",
+			conditions: []metav1.Condition{
+				makeCondition(v1alpha.InstanceProgrammed, "False", v1alpha.InstanceProgrammedReasonImageUnavailable, "image pull failed: not found"),
+			},
+			wantStatus: "Failed (image unavailable)",
+		},
+		{
+			name: "Programmed Unknown/ImageUnavailable — Failed (image unavailable)",
+			conditions: []metav1.Condition{
+				makeCondition(v1alpha.InstanceProgrammed, "Unknown", v1alpha.InstanceProgrammedReasonImageUnavailable, "image pull failed: not found"),
+			},
+			wantStatus: "Failed (image unavailable)",
+		},
+		{
+			name: "Programmed False/InstanceCrashing — Failed (crashing)",
+			conditions: []metav1.Condition{
+				makeCondition(v1alpha.InstanceProgrammed, "False", v1alpha.InstanceProgrammedReasonInstanceCrashing, "exit code 1"),
+			},
+			wantStatus: "Failed (crashing)",
+		},
+		{
+			name: "Programmed Unknown/InstanceCrashing — Failed (crashing)",
+			conditions: []metav1.Condition{
+				makeCondition(v1alpha.InstanceProgrammed, "Unknown", v1alpha.InstanceProgrammedReasonInstanceCrashing, "exit code 1"),
+			},
+			wantStatus: "Failed (crashing)",
+		},
+		{
+			name: "Programmed False/ConfigurationError — Failed (configuration error)",
+			conditions: []metav1.Condition{
+				makeCondition(v1alpha.InstanceProgrammed, "False", v1alpha.InstanceProgrammedReasonConfigurationError, "invalid env var name"),
+			},
+			wantStatus: "Failed (configuration error)",
+		},
+		{
+			name: "Programmed Unknown/ConfigurationError — Failed (configuration error)",
+			conditions: []metav1.Condition{
+				makeCondition(v1alpha.InstanceProgrammed, "Unknown", v1alpha.InstanceProgrammedReasonConfigurationError, "invalid env var name"),
+			},
+			wantStatus: "Failed (configuration error)",
+		},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			got := InstanceStatus(tc.conditions)
+			if got != tc.wantStatus {
+				t.Errorf("InstanceStatus() = %q, want %q", got, tc.wantStatus)
+			}
+		})
+	}
+}
+
+// TestWorkloadHealth_BlockingReason verifies that Unavailable workloads surface
+// the reason from the Available condition rather than a generic message.
+func TestWorkloadHealth_BlockingReason(t *testing.T) {
+	tests := []struct {
+		name       string
+		conditions []metav1.Condition
+		ready      int32
+		desired    int32
+		wantHealth string
+	}{
+		{
+			name: "Available False with reason — Unavailable — ",
+			conditions: []metav1.Condition{
+				makeCondition(v1alpha.WorkloadAvailable, "False", "SourceNotFound", `ConfigMap "app-config" not found`),
+			},
+			ready:      0,
+			desired:    1,
+			wantHealth: "Unavailable — SourceNotFound",
+		},
+		{
+			name: "Available False without reason — generic message",
+			conditions: []metav1.Condition{
+				makeCondition(v1alpha.WorkloadAvailable, "False", "", ""),
+			},
+			ready:      0,
+			desired:    1,
+			wantHealth: "Unavailable — no healthy instances",
+		},
+		{
+			name: "Available True all ready",
+			conditions: []metav1.Condition{
+				makeCondition(v1alpha.WorkloadAvailable, "True", "Available", ""),
+			},
+			ready:      2,
+			desired:    2,
+			wantHealth: "Available — all placements at desired replicas",
+		},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			got := WorkloadHealth(tc.conditions, tc.ready, tc.desired)
+			if got != tc.wantHealth {
+				t.Errorf("WorkloadHealth() = %q, want %q", got, tc.wantHealth)
+			}
+		})
+	}
+}
diff --git a/internal/cmd/compute/util/entitlement.go b/internal/cmd/compute/util/entitlement.go
new file mode 100644
index 00000000..19641a06
--- /dev/null
+++ b/internal/cmd/compute/util/entitlement.go
@@ -0,0 +1,190 @@
+package util
+
+import (
+	"bufio"
+	"context"
+	"fmt"
+	"io"
+	"os"
+	"strings"
+	"time"
+
+	servicesv1alpha1 "go.miloapis.com/service-catalog/api/v1alpha1"
+	"go.datum.net/datumctl/plugin"
+	"golang.org/x/term"
+	apimeta "k8s.io/apimachinery/pkg/api/meta"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/runtime"
+	"k8s.io/apimachinery/pkg/watch"
+	"k8s.io/client-go/rest"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+)
+
+// computeServiceName is the ServiceRef name matched against entitlements and
+// also used as the name of the entitlement object this file creates.
+const computeServiceName = "compute"
+
+// EnsureComputeEntitlement checks that the selected project has an active
+// ServiceEntitlement for the compute service. If none exists, it prompts the
+// user (via in/out) to request access. out should be cmd.ErrOrStderr() so the
+// prompt does not pollute structured output.
+//
+// An empty project returns nil without contacting the API. An entitlement in
+// the pending-approval or rejected phase returns an error describing the next
+// step; an entitlement in any other non-active phase falls through to the prompt.
+func EnsureComputeEntitlement(ctx context.Context, project string, in io.Reader, out io.Writer) error {
+	if project == "" {
+		return nil
+	}
+
+	wc, err := newEntitlementClient(project)
+	if err != nil {
+		return err
+	}
+
+	var list servicesv1alpha1.ServiceEntitlementList
+	if err := wc.List(ctx, &list); err != nil {
+		if apimeta.IsNoMatchError(err) {
+			// API not installed in this project's VCP — treat as no entitlement.
+			return promptAndRequestAccess(ctx, project, wc, in, out)
+		}
+		return fmt.Errorf("checking service entitlement: %w", err)
+	}
+
+	// The first compute entitlement in a recognised phase decides the outcome.
+	for i := range list.Items {
+		item := &list.Items[i]
+		if item.Spec.ServiceRef.Name != computeServiceName {
+			continue
+		}
+		switch item.Status.Phase {
+		case servicesv1alpha1.EntitlementPhaseActive:
+			return nil
+		case servicesv1alpha1.EntitlementPhasePendingApproval:
+			return fmt.Errorf(
+				"compute service entitlement for project %q is pending approval\n\n"+
+					"Check status with: datumctl services list",
+				project,
+			)
+		case servicesv1alpha1.EntitlementPhaseRejected:
+			return fmt.Errorf(
+				"compute service entitlement for project %q was rejected\n\n"+
+					"Re-enable with: datumctl services enable %s",
+				project, computeServiceName,
+			)
+		}
+	}
+
+	return promptAndRequestAccess(ctx, project, wc, in, out)
+}
+
+// promptAndRequestAccess asks the user whether to request compute access for
+// project and, on "y"/"yes", creates a ServiceEntitlement and watches it for up
+// to 15 seconds. It returns nil only when the entitlement is observed in the
+// active phase; every other outcome (non-interactive input, a declined prompt,
+// a timeout, pending approval, or an unexpected phase) returns an error.
+func promptAndRequestAccess(ctx context.Context, project string, wc client.WithWatch, in io.Reader, out io.Writer) error {
+	// Never prompt when input is not a terminal; tell the user how to enable instead.
+	if !isTTY(in) {
+		return fmt.Errorf(
+			"compute service is not enabled for project %q\n\n"+
+				"Enable it with: datumctl services enable %s",
+			project, computeServiceName,
+		)
+	}
+
+	fmt.Fprintf(out, "Compute is not enabled for project %q.\n", project)
+	fmt.Fprintf(out, "Would you like to request access? [y/N]: ")
+
+	scanner := bufio.NewScanner(in)
+	if !scanner.Scan() {
+		return fmt.Errorf("compute service is not enabled for project %q", project)
+	}
+	answer := strings.TrimSpace(strings.ToLower(scanner.Text()))
+	if answer != "y" && answer != "yes" {
+		return fmt.Errorf(
+			"compute service is not enabled for project %q\n\n"+
+				"Enable it with: datumctl services enable %s",
+			project, computeServiceName,
+		)
+	}
+
+	fmt.Fprintf(out, "Requesting access to compute for project %q...\n", project)
+
+	entitlement := &servicesv1alpha1.ServiceEntitlement{
+		ObjectMeta: metav1.ObjectMeta{Name: computeServiceName},
+		Spec: servicesv1alpha1.ServiceEntitlementSpec{
+			ServiceRef: servicesv1alpha1.ServiceRef{Name: computeServiceName},
+		},
+	}
+	if err := wc.Create(ctx, entitlement); err != nil {
+		return fmt.Errorf("requesting compute access: %w", err)
+	}
+
+	// Watch for the Ready condition to appear (set by the reconciler asynchronously).
+	watchCtx, cancel := context.WithTimeout(ctx, 15*time.Second)
+	defer cancel()
+
+	watcher, err := wc.Watch(watchCtx, &servicesv1alpha1.ServiceEntitlementList{})
+	if err != nil {
+		return fmt.Errorf("watching entitlement status: %w", err)
+	}
+	defer watcher.Stop()
+
+	for {
+		select {
+		case <-watchCtx.Done():
+			fmt.Fprintf(out, "\nAccess to compute for project %q has been requested.\n", project)
+			fmt.Fprintf(out, "Run your command again once it becomes active.\n\n")
+			fmt.Fprintf(out, "Check status with: datumctl services list\n")
+			return fmt.Errorf("compute access is not yet active — try again in a moment")
+
+		case event, ok := <-watcher.ResultChan():
+			if !ok {
+				return fmt.Errorf("watch channel closed unexpectedly")
+			}
+			// Only Added/Modified events for the compute entitlement are considered.
+			if event.Type != watch.Modified && event.Type != watch.Added {
+				continue
+			}
+			item, ok := event.Object.(*servicesv1alpha1.ServiceEntitlement)
+			if !ok || item.Spec.ServiceRef.Name != computeServiceName {
+				continue
+			}
+			// Keep waiting until a Ready condition exists on the entitlement.
+			if apimeta.FindStatusCondition(item.Status.Conditions, "Ready") == nil {
+				continue
+			}
+			switch item.Status.Phase {
+			case servicesv1alpha1.EntitlementPhaseActive:
+				fmt.Fprintf(out, "Compute enabled for project %q.\n\n", project)
+				return nil
+			case servicesv1alpha1.EntitlementPhasePendingApproval:
+				fmt.Fprintf(out, "\nYour request to enable compute for project %q has been submitted,\n", project)
+				fmt.Fprintf(out, "but it requires approval before you can use the service.\n")
+				fmt.Fprintf(out, "You will be notified when access is granted.\n\n")
+				fmt.Fprintf(out, "Check status with: datumctl services list\n")
+				return fmt.Errorf("compute access is pending approval")
+			default:
+				return fmt.Errorf("compute entitlement for project %q entered unexpected phase %q", project, item.Status.Phase)
+			}
+		}
+	}
+}
+
+// newEntitlementClient builds a watch-capable client against the project's
+// control-plane URL, authenticated with the plugin token, whose scheme contains
+// only the service-catalog API types.
+func newEntitlementClient(project string) (client.WithWatch, error) {
+	pluginCtx := plugin.Context()
+	if pluginCtx.APIHost == "" {
+		return nil, fmt.Errorf("DATUM_API_HOST is not set; is this plugin running via datumctl?")
+	}
+
+	token, err := plugin.Token()
+	if err != nil {
+		return nil, fmt.Errorf("getting credentials: %w", err)
+	}
+
+	scheme := runtime.NewScheme()
+	if err := servicesv1alpha1.AddToScheme(scheme); err != nil {
+		return nil, fmt.Errorf("registering services scheme: %w", err)
+	}
+
+	cfg := &rest.Config{
+		Host:        ProjectControlPlaneURL(pluginCtx.APIHost, project),
+		BearerToken: token,
+	}
+
+	return client.NewWithWatch(cfg, client.Options{Scheme: scheme})
+}
+
+// isTTY reports whether r is an *os.File whose descriptor is a terminal.
+// Any other reader type is reported as not a terminal.
+func isTTY(r io.Reader) bool {
+	f, ok := r.(*os.File)
+	if !ok {
+		return false
+	}
+	return term.IsTerminal(int(f.Fd()))
+}
diff --git a/internal/cmd/compute/util/printer.go b/internal/cmd/compute/util/printer.go
new file mode 100644
index 00000000..df82be7f
--- /dev/null
+++ b/internal/cmd/compute/util/printer.go
@@ -0,0 +1,39 @@
+package util
+
+import (
+	"encoding/json"
+	"fmt"
+	"io"
+
+	"sigs.k8s.io/yaml"
+)
+
+// OutputFormat represents the requested output format for a command.
+type OutputFormat string
+
+const (
+	OutputTable OutputFormat = "table"
+	OutputWide  OutputFormat = "wide"
+	OutputJSON  OutputFormat = "json"
+	OutputYAML  OutputFormat = "yaml"
+)
+
+// PrintJSON serialises obj to indented JSON and writes it to w.
+// Encoding failures are returned wrapped with an "encoding JSON" prefix.
+func PrintJSON(w io.Writer, obj any) error {
+	enc := json.NewEncoder(w)
+	enc.SetIndent("", " ")
+	if err := enc.Encode(obj); err != nil {
+		return fmt.Errorf("encoding JSON: %w", err)
+	}
+	return nil
+}
+
+// PrintYAML serialises obj to YAML and writes it to w.
+func PrintYAML(w io.Writer, obj any) error {
+	b, err := yaml.Marshal(obj)
+	if err != nil {
+		return fmt.Errorf("encoding YAML: %w", err)
+	}
+	// A write failure is returned as-is, without additional wrapping.
+	_, err = w.Write(b)
+	return err
+}
diff --git a/internal/cmd/compute/util/quota.go b/internal/cmd/compute/util/quota.go
new file mode 100644
index 00000000..8acf99ff
--- /dev/null
+++ b/internal/cmd/compute/util/quota.go
@@ -0,0 +1,164 @@
+package util
+
+import (
+	"context"
+	"fmt"
+	"sort"
+	"strings"
+
+	quotav1alpha1 "go.miloapis.com/milo/pkg/apis/quota/v1alpha1"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+)
+
+// QuotaRow holds display-ready quota data for one resource type.
+// Limit, Used and Available are already divided by the row's display divisor.
+type QuotaRow struct {
+	ResourceType string `json:"resourceType"`
+	DisplayName  string `json:"displayName"`
+	Unit         string `json:"unit"`
+	Limit        int64  `json:"limit"`
+	Used         int64  `json:"used"`
+	Available    int64  `json:"available"`
+}
+
+// QuotaMeta overrides display metadata for a resource type. When provided,
+// DisplayName, Unit, and Divisor take precedence over ResourceRegistration values.
+type QuotaMeta struct {
+	DisplayName string
+	Unit        string
+	// Divisor converts the stored integer value to display units (e.g. 1000 for
+	// millicores → vCPUs). Values of 1 or less are treated as 1.
+	Divisor int64
+	// Order controls the position of this row in the returned slice (ascending).
+	// Rows without a meta entry sort after all meta rows, alphabetically.
+	//
+	// NOTE(review): ListServiceQuota does not read this field; row order there
+	// is driven solely by its orderedTypes argument.
+	Order int
+}
+
+// ListServiceQuota returns quota rows for AllowanceBuckets whose resource type
+// begins with resourceTypePrefix (e.g. "compute.datumapis.com"). projectClient
+// must target the project's virtual control plane; platformClient must target
+// the platform API server (used to fetch ResourceRegistrations for display
+// metadata when no override is provided in meta) and may be nil.
+//
+// meta may be nil. When an entry exists for a resource type, its DisplayName,
+// Unit, and Divisor are used; otherwise the ResourceRegistration's displayUnit
+// field is used and the divisor defaults to 1. Values are converted to display
+// units with integer division, so fractional remainders are dropped.
+//
+// orderedTypes gives the explicit display order. Resource types listed there
+// come first, in that order; all remaining types follow in ascending
+// alphabetical order. Each resource type appears at most once.
+//
+// When no bucket matches the prefix the result is (nil, nil). A failure to
+// list AllowanceBuckets is returned wrapped; a failure to list
+// ResourceRegistrations is tolerated, since they only supply fallback units.
+func ListServiceQuota(
+	ctx context.Context,
+	projectClient, platformClient client.Client,
+	resourceTypePrefix string,
+	meta map[string]QuotaMeta,
+	orderedTypes []string, // explicit display order; types not in this list follow alphabetically
+) ([]QuotaRow, error) {
+	// Fetch AllowanceBuckets from the project VCP.
+	var bucketList quotav1alpha1.AllowanceBucketList
+	if err := projectClient.List(ctx, &bucketList,
+		client.InNamespace("milo-system"),
+		client.MatchingLabels{"quota.miloapis.com/consumer-kind": "Project"},
+	); err != nil {
+		return nil, fmt.Errorf("listing allowance buckets: %w", err)
+	}
+
+	// Index buckets by resource type, filtering to the requested prefix.
+	// If several buckets share a resource type, the last one listed wins.
+	bucketByType := make(map[string]*quotav1alpha1.AllowanceBucket)
+	for i := range bucketList.Items {
+		b := &bucketList.Items[i]
+		if strings.HasPrefix(b.Spec.ResourceType, resourceTypePrefix) {
+			bucketByType[b.Spec.ResourceType] = b
+		}
+	}
+
+	if len(bucketByType) == 0 {
+		return nil, nil
+	}
+
+	// Fetch ResourceRegistrations from the platform for display metadata fallback.
+	// This lookup is best-effort: on error the rows simply keep the default unit.
+	rrByType := make(map[string]*quotav1alpha1.ResourceRegistration)
+	if platformClient != nil {
+		var rrList quotav1alpha1.ResourceRegistrationList
+		if err := platformClient.List(ctx, &rrList); err == nil {
+			for i := range rrList.Items {
+				rr := &rrList.Items[i]
+				if strings.HasPrefix(rr.Spec.ResourceType, resourceTypePrefix) {
+					rrByType[rr.Spec.ResourceType] = rr
+				}
+			}
+		}
+	}
+
+	rows := make([]QuotaRow, 0, len(bucketByType))
+	seen := make(map[string]bool, len(bucketByType))
+
+	// appendRow emits the row for rt once; repeated calls for the same type are ignored.
+	appendRow := func(rt string, b *quotav1alpha1.AllowanceBucket) {
+		if seen[rt] {
+			return
+		}
+		seen[rt] = true
+
+		displayName := resourceTypeSuffix(rt)
+		unit := "units"
+		var divisor int64 = 1
+
+		if m, ok := meta[rt]; ok {
+			if m.DisplayName != "" {
+				displayName = m.DisplayName
+			}
+			if m.Unit != "" {
+				unit = m.Unit
+			}
+			if m.Divisor > 1 {
+				divisor = m.Divisor
+			}
+		} else if rr, ok := rrByType[rt]; ok && rr.Spec.DisplayUnit != "" && rr.Spec.DisplayUnit != "1" {
+			unit = rr.Spec.DisplayUnit
+		}
+
+		rows = append(rows, QuotaRow{
+			ResourceType: rt,
+			DisplayName:  displayName,
+			Unit:         unit,
+			Limit:        b.Status.Limit / divisor,
+			Used:         b.Status.Allocated / divisor,
+			Available:    b.Status.Available / divisor,
+		})
+	}
+
+	// Explicitly ordered types first, skipping any that have no bucket.
+	for _, rt := range orderedTypes {
+		if b, ok := bucketByType[rt]; ok {
+			appendRow(rt, b)
+		}
+	}
+
+	// Then every bucket not covered by orderedTypes, in alphabetical order so
+	// the output does not depend on map iteration order.
+	remaining := make([]string, 0, len(bucketByType))
+	for rt := range bucketByType {
+		if !seen[rt] {
+			remaining = append(remaining, rt)
+		}
+	}
+	sort.Strings(remaining)
+	for _, rt := range remaining {
+		appendRow(rt, bucketByType[rt])
+	}
+
+	return rows, nil
+}
+
+// resourceTypeSuffix derives a human-readable name from the last segment of a
+// resource type string (e.g. "compute.datumapis.com/vcpus" → "vcpus").
+func resourceTypeSuffix(resourceType string) string { + if idx := strings.LastIndex(resourceType, "/"); idx >= 0 { + return resourceType[idx+1:] + } + return resourceType +} diff --git a/internal/cmd/compute/util/table.go b/internal/cmd/compute/util/table.go new file mode 100644 index 00000000..6402c44d --- /dev/null +++ b/internal/cmd/compute/util/table.go @@ -0,0 +1,12 @@ +package util + +import ( + "io" + "text/tabwriter" +) + +// NewTabWriter returns a *tabwriter.Writer configured for command table output. +// Use tab ('\t') as the column separator in rows. Caller must call Flush(). +func NewTabWriter(w io.Writer) *tabwriter.Writer { + return tabwriter.NewWriter(w, 0, 0, 3, ' ', 0) +} diff --git a/internal/cmd/compute/util/time.go b/internal/cmd/compute/util/time.go new file mode 100644 index 00000000..906af663 --- /dev/null +++ b/internal/cmd/compute/util/time.go @@ -0,0 +1,33 @@ +package util + +import ( + "fmt" + "time" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// RelativeAge returns a compact age string for table cells (no "ago" suffix). +// +// < 60s → "Xs" +// < 60m → "Xm" +// < 24h → "Xh" +// >= 24h → "Xd" +func RelativeAge(t metav1.Time) string { + d := time.Since(t.Time) + switch { + case d < time.Minute: + return fmt.Sprintf("%ds", int(d.Seconds())) + case d < time.Hour: + return fmt.Sprintf("%dm", int(d.Minutes())) + case d < 24*time.Hour: + return fmt.Sprintf("%dh", int(d.Hours())) + default: + return fmt.Sprintf("%dd", int(d.Hours()/24)) + } +} + +// RelativeAgeVerbose returns an age string with "ago" suffix for detail views. +func RelativeAgeVerbose(t metav1.Time) string { + return RelativeAge(t) + " ago" +} diff --git a/internal/cmd/compute/watch/watch.go b/internal/cmd/compute/watch/watch.go new file mode 100644 index 00000000..13b2651d --- /dev/null +++ b/internal/cmd/compute/watch/watch.go @@ -0,0 +1,255 @@ +// Package watch provides a rollout progress watcher for compute workloads. 
+package watch + +import ( + "context" + "fmt" + "io" + "time" + + "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" + + computev1alpha "go.datum.net/compute/api/v1alpha" + "go.datum.net/compute/internal/cmd/compute/util" +) + +type deploymentPhase string + +const ( + phasePending deploymentPhase = "Pending" + phaseUpdating deploymentPhase = "Updating" + phaseDone deploymentPhase = "Done" + phaseBlocked deploymentPhase = "Blocked" +) + +type deploymentState struct { + placement string + city string + desired int32 + ready int32 + current int32 + phase deploymentPhase + stalledSince time.Time +} + +// Rollout polls WorkloadDeployment objects for the given workload UID, printing +// per-city progress rows as state changes. It returns when all deployments +// reach Done, or when ctx is cancelled (Ctrl-C detach). +func Rollout(ctx context.Context, c client.Client, out io.Writer, project string, workloadUID types.UID) error { + start := time.Now() + + selector := labels.SelectorFromSet(labels.Set{ + computev1alpha.WorkloadUIDLabel: string(workloadUID), + }) + + tw := util.NewTabWriter(out) + headerPrinted := false + states := map[string]*deploymentState{} + + ticker := time.NewTicker(2 * time.Second) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + _ = tw.Flush() + _, _ = fmt.Fprintln(out, "Detached. Rollout continues in background.") + return nil + + case <-ticker.C: + var deployList computev1alpha.WorkloadDeploymentList + if err := c.List(ctx, &deployList, + client.InNamespace(util.ResourceNamespace), + client.MatchingLabelsSelector{Selector: selector}, + ); err != nil { + if ctx.Err() != nil { + return nil + } + // Transient error — keep polling. 
+ continue + } + + if len(deployList.Items) == 0 { + continue + } + + if !headerPrinted { + _, _ = fmt.Fprintln(out, "\n PLACEMENT\tCITY\tUPDATED\tREADY\tOLD\tPHASE") + headerPrinted = true + } + + allDone := processDeployments(ctx, c, out, project, tw, states, deployList.Items) + + if allDone && len(deployList.Items) > 0 { + printElapsed(out, time.Since(start).Round(time.Second)) + return nil + } + } + } +} + +// tabFlusher is a writer that can also be flushed (e.g. tabwriter.Writer). +type tabFlusher interface { + io.Writer + Flush() error +} + +// processDeployments updates state for each deployment, prints changed rows, +// and returns true when every deployment has reached the Done phase. +func processDeployments( + ctx context.Context, + c client.Client, + out io.Writer, + project string, + tw tabFlusher, + states map[string]*deploymentState, + deployments []computev1alpha.WorkloadDeployment, +) bool { + allDone := true + for _, d := range deployments { + key := d.Spec.CityCode + prev, exists := states[key] + + desired := resolveDesired(d) + ready := d.Status.ReadyReplicas + current := d.Status.CurrentReplicas + + newPhase := computePhase(desired, ready, current, d.Status.Replicas) + + if !exists || prev.desired != desired || prev.ready != ready || prev.current != current || prev.phase != newPhase { + newPhase = updateDeploymentState(states, key, d, exists, prev, desired, ready, current, newPhase) + printDeploymentRow(ctx, c, out, project, tw, d, current, ready, newPhase) + } + + if newPhase != phaseDone { + allDone = false + } + } + return allDone +} + +// updateDeploymentState updates the states map for a deployment and returns the +// (possibly promoted) phase. 
+func updateDeploymentState( + states map[string]*deploymentState, + key string, + d computev1alpha.WorkloadDeployment, + exists bool, + prev *deploymentState, + desired, ready, current int32, + newPhase deploymentPhase, +) deploymentPhase { + st := &deploymentState{ + placement: d.Spec.PlacementName, + city: d.Spec.CityCode, + desired: desired, + ready: ready, + current: current, + phase: newPhase, + } + if exists { + st.stalledSince = prev.stalledSince + } + + // Track when we first noticed a potential stall. + if newPhase != phaseDone && newPhase != phasePending { + if !exists || prev.phase == phasePending { + st.stalledSince = time.Now() + } else if prev.ready == ready && prev.current == current { + st.stalledSince = prev.stalledSince + } else { + st.stalledSince = time.Now() + } + } + + // Promote to Blocked if stalled > 30s without progress. + if newPhase == phaseUpdating && !st.stalledSince.IsZero() && time.Since(st.stalledSince) > 30*time.Second { + st.phase = phaseBlocked + newPhase = phaseBlocked + } + + states[key] = st + return newPhase +} + +// printDeploymentRow writes a progress row and, if blocked, detail about the +// first non-ready instance. +func printDeploymentRow( + ctx context.Context, + c client.Client, + out io.Writer, + project string, + tw tabFlusher, + d computev1alpha.WorkloadDeployment, + current, ready int32, + newPhase deploymentPhase, +) { + old := max(d.Status.Replicas-d.Status.CurrentReplicas, 0) + + _, _ = fmt.Fprintf(tw, " %s\t%s\t%d\t%d\t%d\t%s\n", + d.Spec.PlacementName, + d.Spec.CityCode, + current, + ready, + old, + string(newPhase), + ) + _ = tw.Flush() + + if newPhase == phaseBlocked { + printBlockedDetail(ctx, c, out, project, d) + } +} + +// printElapsed writes the total rollout duration to out. 
+func printElapsed(out io.Writer, elapsed time.Duration) { + minutes := int(elapsed.Minutes()) + seconds := int(elapsed.Seconds()) % 60 + if minutes > 0 { + _, _ = fmt.Fprintf(out, "Rollout complete in %dm %ds.\n", minutes, seconds) + } else { + _, _ = fmt.Fprintf(out, "Rollout complete in %ds.\n", seconds) + } +} + +// resolveDesired returns the replica count the rollout should wait for. +// Status.DesiredReplicas stays at zero until the controller first reconciles +// the deployment; until then, fall back to the spec minimum so a freshly +// created deployment isn't reported Done before any instances are scheduled. +// Once the controller has reported a desired count, trust it. +func resolveDesired(d computev1alpha.WorkloadDeployment) int32 { + if d.Status.DesiredReplicas == 0 { + return d.Spec.ScaleSettings.MinReplicas + } + return d.Status.DesiredReplicas +} + +func computePhase(desired, ready, current, replicas int32) deploymentPhase { + if desired == 0 { + return phaseDone + } + if ready >= desired && current >= desired && replicas <= current { + return phaseDone + } + if current == 0 { + return phasePending + } + return phaseUpdating +} + +// printBlockedDetail prints the blocking reason from the deployment's own +// Available condition. The server rolls up the underlying instance cause there, +// so no per-instance fetch is needed. 
+func printBlockedDetail(_ context.Context, _ client.Client, out io.Writer, _ string, d computev1alpha.WorkloadDeployment) { + reason, msg, blocked := util.ReadinessBlock(d.Status.Conditions, computev1alpha.WorkloadDeploymentAvailable) + if !blocked { + return + } + if msg != "" { + fmt.Fprintf(out, " Blocked reason: %s — %s\n", reason, msg) + } else if reason != "" { + fmt.Fprintf(out, " Blocked reason: %s\n", reason) + } +} diff --git a/internal/cmd/compute/watch/watch_test.go b/internal/cmd/compute/watch/watch_test.go new file mode 100644 index 00000000..33734954 --- /dev/null +++ b/internal/cmd/compute/watch/watch_test.go @@ -0,0 +1,181 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package watch + +import ( + "testing" + "time" + + computev1alpha "go.datum.net/compute/api/v1alpha" +) + +func TestResolveDesired(t *testing.T) { + tests := []struct { + name string + statusDesired int32 + specMinReplicas int32 + want int32 + }{ + { + name: "unreconciled status falls back to spec min", + statusDesired: 0, + specMinReplicas: 1, + want: 1, + }, + { + name: "genuine scale-to-zero", + statusDesired: 0, + specMinReplicas: 0, + want: 0, + }, + { + name: "controller desired above min (e.g. autoscaled)", + statusDesired: 3, + specMinReplicas: 1, + want: 3, + }, + { + // Once the controller has spoken, trust its value even if below spec min. 
+ name: "controller desired below min — trust controller", + statusDesired: 1, + specMinReplicas: 2, + want: 1, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + d := computev1alpha.WorkloadDeployment{} + d.Status.DesiredReplicas = tc.statusDesired + d.Spec.ScaleSettings.MinReplicas = tc.specMinReplicas + + got := resolveDesired(d) + if got != tc.want { + t.Errorf("resolveDesired() = %d, want %d", got, tc.want) + } + }) + } +} + +func TestComputePhase(t *testing.T) { + tests := []struct { + name string + desired int32 + ready int32 + current int32 + replicas int32 + want deploymentPhase + }{ + { + name: "zero desired is Done", + desired: 0, + ready: 0, + current: 0, + replicas: 0, + want: phaseDone, + }, + { + name: "fresh create with no instances yet is Pending", + desired: 1, + ready: 0, + current: 0, + replicas: 0, + want: phasePending, + }, + { + name: "instance scheduled but not ready is Updating", + desired: 1, + ready: 0, + current: 1, + replicas: 1, + want: phaseUpdating, + }, + { + name: "single instance ready is Done", + desired: 1, + ready: 1, + current: 1, + replicas: 1, + want: phaseDone, + }, + { + // OLD replicas still draining after scale-down must not report Done. 
+ name: "scale-down with old replicas still draining is Updating", + desired: 1, + ready: 1, + current: 1, + replicas: 5, + want: phaseUpdating, + }, + { + name: "partial readiness is Updating", + desired: 3, + ready: 1, + current: 2, + replicas: 3, + want: phaseUpdating, + }, + { + name: "all replicas ready is Done", + desired: 2, + ready: 2, + current: 2, + replicas: 2, + want: phaseDone, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + got := computePhase(tc.desired, tc.ready, tc.current, tc.replicas) + if got != tc.want { + t.Errorf("computePhase(%d, %d, %d, %d) = %q, want %q", + tc.desired, tc.ready, tc.current, tc.replicas, got, tc.want) + } + }) + } +} + +func TestUpdateDeploymentState(t *testing.T) { + const key = "DFW" + + makeDeployment := func() computev1alpha.WorkloadDeployment { + var d computev1alpha.WorkloadDeployment + d.Spec.PlacementName = "default" + d.Spec.CityCode = key + return d + } + + t.Run("stalled updating for 40s is promoted to Blocked", func(t *testing.T) { + states := map[string]*deploymentState{} + stalledAt := time.Now().Add(-40 * time.Second) + states[key] = &deploymentState{ + phase: phaseUpdating, + ready: 1, + current: 2, + stalledSince: stalledAt, + } + + got := updateDeploymentState(states, key, makeDeployment(), true, states[key], 3, 1, 2, phaseUpdating) + + if got != phaseBlocked { + t.Errorf("updateDeploymentState() phase = %q, want %q", got, phaseBlocked) + } + if states[key].phase != phaseBlocked { + t.Errorf("states[key].phase = %q, want %q", states[key].phase, phaseBlocked) + } + }) + + t.Run("first observation of Updating is not yet Blocked", func(t *testing.T) { + states := map[string]*deploymentState{} + + got := updateDeploymentState(states, key, makeDeployment(), false, nil, 3, 1, 2, phaseUpdating) + + if got != phaseUpdating { + t.Errorf("updateDeploymentState() phase = %q, want %q", got, phaseUpdating) + } + if states[key].phase != phaseUpdating { + t.Errorf("states[key].phase = %q, 
want %q", states[key].phase, phaseUpdating) + } + }) +} diff --git a/internal/cmd/compute/workloads/workloads.go b/internal/cmd/compute/workloads/workloads.go new file mode 100644 index 00000000..ba4d85e4 --- /dev/null +++ b/internal/cmd/compute/workloads/workloads.go @@ -0,0 +1,473 @@ +package workloads + +import ( + "context" + "fmt" + "strings" + + "github.com/spf13/cobra" + corev1 "k8s.io/api/core/v1" + k8serrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" + + computev1alpha "go.datum.net/compute/api/v1alpha" + "go.datum.net/compute/internal/cmd/compute/util" +) + +// Command returns the top-level "workloads" command group. +func Command() *cobra.Command { + cmd := &cobra.Command{ + Use: "workloads", + Short: "List or inspect workloads", + Long: `List all workloads in the project, optionally filtered by health or city. +Use the describe subcommand for a unified config + health view of a single workload.`, + Example: ` # List all workloads + datumctl compute workloads + + # Filter by health + datumctl compute workloads --health=degraded + + # Filter by city + datumctl compute workloads --city=DFW + + # Machine-readable output + datumctl compute workloads -o json + + # Describe a single workload + datumctl compute workloads describe api`, + RunE: func(cmd *cobra.Command, args []string) error { + return runList(cmd, args) + }, + } + + // List flags. 
+ cmd.Flags().String("health", "", "Filter by health: available, degraded, progressing, unknown") + cmd.Flags().String("city", "", "Filter to workloads with a placement in this city") + cmd.Flags().StringP("output", "o", "table", "Output format: table, wide, json, yaml") + cmd.Flags().Bool("no-headers", false, "Omit the table header row (table and wide only)") + + _ = cmd.RegisterFlagCompletionFunc("health", util.CompleteOutputFormats("available", "degraded", "progressing", "unknown")) + _ = cmd.RegisterFlagCompletionFunc("city", util.CompleteCityCodes) + _ = cmd.RegisterFlagCompletionFunc("output", util.CompleteOutputFormats("table", "wide", "json", "yaml")) + + cmd.AddCommand(describeCommand()) + + return cmd +} + +// ----------------------------------------------------------------------- +// workloads list +// ----------------------------------------------------------------------- + +func runList(cmd *cobra.Command, _ []string) error { + ctx := context.Background() + project := util.ProjectFromCmd(cmd) + + outputFlag, _ := cmd.Flags().GetString("output") + healthFilter, _ := cmd.Flags().GetString("health") + cityFilter, _ := cmd.Flags().GetString("city") + noHeaders, _ := cmd.Flags().GetBool("no-headers") + + c, err := util.NewClient(project) + if err != nil { + return err + } + + var wlList computev1alpha.WorkloadList + if err := c.List(ctx, &wlList, client.InNamespace(util.ResourceNamespace)); err != nil { + return fmt.Errorf("listing workloads: %w", err) + } + + // JSON/YAML: emit raw API resource and return early. + switch util.OutputFormat(outputFlag) { + case util.OutputJSON: + return util.PrintJSON(cmd.OutOrStdout(), &wlList) + case util.OutputYAML: + return util.PrintYAML(cmd.OutOrStdout(), &wlList) + } + + // For table output we need deployment data to compute READY counts. 
+ var deployList computev1alpha.WorkloadDeploymentList + if err := c.List(ctx, &deployList, client.InNamespace(util.ResourceNamespace)); err != nil { + return fmt.Errorf("listing deployments: %w", err) + } + + // Build map: workloadUID → []WorkloadDeployment. + deploysByWorkload := make(map[string][]computev1alpha.WorkloadDeployment) + for _, d := range deployList.Items { + wUID := d.Labels[computev1alpha.WorkloadUIDLabel] + deploysByWorkload[wUID] = append(deploysByWorkload[wUID], d) + } + + // City filter: collect the set of workload UIDs that have a deployment in + // the requested city code. + cityFilteredUIDs := map[string]bool{} + if cityFilter != "" { + for _, d := range deployList.Items { + if d.Spec.CityCode == cityFilter { + wUID := d.Labels[computev1alpha.WorkloadUIDLabel] + cityFilteredUIDs[wUID] = true + } + } + } + + type workloadRow struct { + name string + health string + healthShort string // first word, for filter comparison + readyStr string + upToDateStr string + placements string + image string + age string + instType string // wide only + } + + wide := util.OutputFormat(outputFlag) == util.OutputWide + + var rows []workloadRow + + for _, wl := range wlList.Items { + wUID := string(wl.UID) + + // City filter. + if cityFilter != "" && !cityFilteredUIDs[wUID] { + continue + } + + deps := deploysByWorkload[wUID] + var totalReady, totalUpdated, totalDesired int32 + for _, d := range deps { + totalReady += d.Status.ReadyReplicas + totalUpdated += d.Status.UpdatedReplicas + totalDesired += d.Status.DesiredReplicas + } + + health := util.WorkloadHealth(wl.Status.Conditions, totalReady, totalDesired) + healthShort := strings.SplitN(health, " ", 2)[0] // e.g. "Available", "Degraded" + + // Health filter. + if healthFilter != "" && !strings.EqualFold(healthShort, healthFilter) { + continue + } + + // Placement names. 
+ var placementNames []string + for _, p := range wl.Spec.Placements { + placementNames = append(placementNames, p.Name) + } + placements := strings.Join(placementNames, ", ") + if placements == "" { + placements = "(none)" + } + + // Image from first container. + image := "(vm)" + if wl.Spec.Template.Spec.Runtime.Sandbox != nil && + len(wl.Spec.Template.Spec.Runtime.Sandbox.Containers) > 0 { + image = truncateImage(wl.Spec.Template.Spec.Runtime.Sandbox.Containers[0].Image) + } + + readyStr := fmt.Sprintf("%d/%d", totalReady, totalDesired) + upToDateStr := fmt.Sprintf("%d/%d", totalUpdated, totalDesired) + instType := wl.Spec.Template.Spec.Runtime.Resources.InstanceType + + rows = append(rows, workloadRow{ + name: wl.Name, + health: health, + healthShort: healthShort, + readyStr: readyStr, + upToDateStr: upToDateStr, + placements: placements, + image: image, + age: util.RelativeAge(wl.CreationTimestamp), + instType: instType, + }) + } + + // Tally health counts from the filtered rows (W9: count after filtering). 
+ healthCounts := map[string]int{ + "Available": 0, + "Degraded": 0, + "Unavailable": 0, + "Unknown": 0, + } + for _, r := range rows { + switch r.healthShort { + case "Available": + healthCounts["Available"]++ + case "Degraded": + healthCounts["Degraded"]++ + case "Unavailable": + healthCounts["Unavailable"]++ + default: + healthCounts["Unknown"]++ + } + } + + out := cmd.OutOrStdout() + + if len(rows) == 0 { + if healthFilter != "" { + fmt.Fprintf(out, "No workloads in project %s match health=%s.\n", project, healthFilter) + } else if cityFilter != "" { + fmt.Fprintf(out, "No workloads in project %s have a placement in city %s.\n", project, cityFilter) + } else { + fmt.Fprintf(out, "No workloads found in project %s.\n\n", project) + fmt.Fprintf(out, "Get started:\n") + fmt.Fprintf(out, " datumctl compute deploy --name=api --image=ghcr.io/acme/api:v1.0.0 --city=DFW\n") + } + return nil + } + + tw := util.NewTabWriter(out) + if !noHeaders { + if wide { + fmt.Fprintf(tw, "NAME\tHEALTH\tREADY\tUP-TO-DATE\tPLACEMENTS\tIMAGE\tAGE\tINSTANCE TYPE\n") + } else { + fmt.Fprintf(tw, "NAME\tHEALTH\tREADY\tUP-TO-DATE\tPLACEMENTS\tIMAGE\tAGE\n") + } + } + for _, r := range rows { + if wide { + fmt.Fprintf(tw, "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", + r.name, r.healthShort, r.readyStr, r.upToDateStr, r.placements, r.image, r.age, r.instType) + } else { + fmt.Fprintf(tw, "%s\t%s\t%s\t%s\t%s\t%s\t%s\n", + r.name, r.healthShort, r.readyStr, r.upToDateStr, r.placements, r.image, r.age) + } + } + _ = tw.Flush() + + // Footer summary. + fmt.Fprintf(out, "\n%d workloads — %d Available, %d Degraded, %d Unavailable, %d Unknown\n", + len(rows), + healthCounts["Available"], + healthCounts["Degraded"], + healthCounts["Unavailable"], + healthCounts["Unknown"], + ) + + return nil +} + +// truncateImage strips the registry host from an image reference so the table +// column stays compact. "ghcr.io/acme/api:v1" → "acme/api:v1". 
+func truncateImage(image string) string { + parts := strings.SplitN(image, "/", 2) + if len(parts) == 2 { + // Only strip the first component if it looks like a registry host + // (contains a '.' or ':' — as opposed to a Docker Hub org name). + host := parts[0] + if strings.ContainsAny(host, ".:") { + return parts[1] + } + } + return image +} + +// ----------------------------------------------------------------------- +// workloads describe +// ----------------------------------------------------------------------- + +func describeCommand() *cobra.Command { + cmd := &cobra.Command{ + Use: "describe ", + Short: "Show config and health for a single workload", + Long: `Display a unified view of workload configuration (container spec, scale settings) +and runtime health (per-city ready/desired counts). Replaces 'datumctl compute status'.`, + Args: cobra.ExactArgs(1), + Example: ` datumctl compute workloads describe api`, + RunE: func(cmd *cobra.Command, args []string) error { + return runDescribe(cmd, args) + }, + ValidArgsFunction: util.CompleteWorkloadNames, + } + + cmd.Flags().StringP("output", "o", "wide", "Output format: wide, json, yaml") + _ = cmd.RegisterFlagCompletionFunc("output", util.CompleteOutputFormats("wide", "json", "yaml")) + + return cmd +} + +func runDescribe(cmd *cobra.Command, args []string) error { + ctx := context.Background() + project := util.ProjectFromCmd(cmd) + outputFlag, _ := cmd.Flags().GetString("output") + + c, err := util.NewClient(project) + if err != nil { + return err + } + + workloadName := args[0] + + var wl computev1alpha.Workload + if err := c.Get(ctx, types.NamespacedName{Namespace: util.ResourceNamespace, Name: workloadName}, &wl); err != nil { + if k8serrors.IsNotFound(err) { + return fmt.Errorf("workload %q not found in project %s", workloadName, project) + } + return fmt.Errorf("getting workload: %w", err) + } + + // JSON / YAML: emit the raw resource. 
+ switch util.OutputFormat(outputFlag) { + case util.OutputJSON: + return util.PrintJSON(cmd.OutOrStdout(), &wl) + case util.OutputYAML: + return util.PrintYAML(cmd.OutOrStdout(), &wl) + } + + // List deployments for this workload. + selector := labels.SelectorFromSet(labels.Set{computev1alpha.WorkloadUIDLabel: string(wl.UID)}) + var deployList computev1alpha.WorkloadDeploymentList + if err := c.List(ctx, &deployList, client.InNamespace(util.ResourceNamespace), client.MatchingLabelsSelector{Selector: selector}); err != nil { + return fmt.Errorf("listing deployments: %w", err) + } + + // Compute totals for health. + var totalDesired, totalReady int32 + for _, d := range deployList.Items { + totalDesired += d.Status.DesiredReplicas + totalReady += d.Status.ReadyReplicas + } + + health := util.WorkloadHealth(wl.Status.Conditions, totalReady, totalDesired) + + // Determine type label. + typeLabel := "virtual-machine" + if wl.Spec.Template.Spec.Runtime.Sandbox != nil { + instType := wl.Spec.Template.Spec.Runtime.Resources.InstanceType + if instType != "" { + typeLabel = "sandbox/" + instType + } else { + typeLabel = "sandbox" + } + } + + age := util.RelativeAgeVerbose(wl.CreationTimestamp) + + out := cmd.OutOrStdout() + + // Header block. + fmt.Fprintf(out, "%-12s %-31s project: %s\n", "Workload", workloadName, project) + fmt.Fprintf(out, "%-12s %s\n", "Type", typeLabel) + fmt.Fprintf(out, "%-12s %s\n", "Updated", age) + fmt.Fprintf(out, "\n") + fmt.Fprintf(out, "%-12s %s\n", "Health", health) + fmt.Fprintf(out, "\n") + + // Placements block. + fmt.Fprintf(out, "Placements\n") + if len(wl.Spec.Placements) == 0 { + fmt.Fprintf(out, " (none configured — workload will not run anywhere)\n") + } else { + // Build a map: placementName → []WorkloadDeployment. 
+ deplsByPlacement := make(map[string][]computev1alpha.WorkloadDeployment) + for _, d := range deployList.Items { + deplsByPlacement[d.Spec.PlacementName] = append(deplsByPlacement[d.Spec.PlacementName], d) + } + + for _, p := range wl.Spec.Placements { + // Placement header line. + maxStr := "∞" + if p.ScaleSettings.MaxReplicas != nil { + maxStr = fmt.Sprintf("%d", *p.ScaleSettings.MaxReplicas) + } + cityCodes := strings.Join(p.CityCodes, ", ") + fmt.Fprintf(out, " %-10s cities: %-24s scale: %d..%s\n", + p.Name, cityCodes, p.ScaleSettings.MinReplicas, maxStr) + + // Per-city lines from deployments. + for _, d := range deplsByPlacement[p.Name] { + readyStr := fmt.Sprintf("%d/%d", d.Status.ReadyReplicas, d.Status.DesiredReplicas) + annotation := "" + if d.Status.ReadyReplicas < d.Status.DesiredReplicas { + // Read blocking reason from the deployment's own condition. + annotation = degradedAnnotation(ctx, c, d) + } + if annotation != "" { + fmt.Fprintf(out, " %-8s ready: %-10s %s\n", d.Spec.CityCode, readyStr, annotation) + } else { + fmt.Fprintf(out, " %-8s ready: %s\n", d.Spec.CityCode, readyStr) + } + } + } + } + fmt.Fprintf(out, "\n") + + // Container block (sandbox only). + if wl.Spec.Template.Spec.Runtime.Sandbox != nil && len(wl.Spec.Template.Spec.Runtime.Sandbox.Containers) > 0 { + ctr := wl.Spec.Template.Spec.Runtime.Sandbox.Containers[0] + fmt.Fprintf(out, "Container\n") + fmt.Fprintf(out, " %-10s %s\n", "Image", ctr.Image) + + if len(ctr.Ports) > 0 { + var portStrs []string + for _, p := range ctr.Ports { + proto := "TCP" + if p.Protocol != nil { + proto = string(*p.Protocol) + } + portStrs = append(portStrs, fmt.Sprintf("%d/%s", p.Port, proto)) + } + fmt.Fprintf(out, " %-10s %s\n", "Ports", strings.Join(portStrs, ", ")) + } + + if len(ctr.Env) > 0 { + fmt.Fprintf(out, " Env\n") + for _, e := range ctr.Env { + fmt.Fprintf(out, " %s\n", formatEnvVar(e)) + } + } + + // Resources. 
+ instType := wl.Spec.Template.Spec.Runtime.Resources.InstanceType + if instType != "" { + fmt.Fprintf(out, " %-10s %s\n", "Resources", instType) + } + + fmt.Fprintf(out, "\n") + } + + // Next steps. + fmt.Fprintf(out, "Next steps:\n") + fmt.Fprintf(out, " %-25s datumctl compute instances --workload=%s\n", "List instances:", workloadName) + fmt.Fprintf(out, " %-25s datumctl compute logs \n", "Stream logs:") + fmt.Fprintf(out, " %-25s datumctl compute rollout undo %s\n", "Roll back:", workloadName) + + return nil +} + +// degradedAnnotation returns a short annotation for a per-city line when the +// deployment is not fully ready. It reads the blocking reason+message from the +// deployment's own Available condition, which the server rolls up from the +// underlying instances. No per-instance fetch or reason branching needed. +func degradedAnnotation(_ context.Context, _ client.Client, d computev1alpha.WorkloadDeployment) string { + reason, msg, blocked := util.ReadinessBlock(d.Status.Conditions, computev1alpha.WorkloadDeploymentAvailable) + if !blocked { + return "" + } + if msg != "" { + return "Blocked — " + msg + } + if reason != "" { + return "Blocked — " + reason + } + return "Blocked" +} + +// formatEnvVar renders a single EnvVar for display. +func formatEnvVar(e corev1.EnvVar) string { + if e.ValueFrom != nil { + if e.ValueFrom.SecretKeyRef != nil { + return fmt.Sprintf("%-20s from secret %s", e.Name, e.ValueFrom.SecretKeyRef.Name) + } + if e.ValueFrom.ConfigMapKeyRef != nil { + return fmt.Sprintf("%-20s from configmap %s", e.Name, e.ValueFrom.ConfigMapKeyRef.Name) + } + } + return fmt.Sprintf("%-20s %s", e.Name, e.Value) +} diff --git a/internal/config/config.go b/internal/config/config.go index dddb7926..4a6e8e76 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -229,6 +229,23 @@ type DiscoveryConfig struct { // template when connecting to project control planes. When not provided, // the operator will use the in-cluster config. 
ProjectKubeconfigPath string `json:"projectKubeconfigPath"` + + // ClusterName is the stable, unique name for this edge cluster. It is + // stamped onto ResourceClaim objects so that each edge controller can + // distinguish its own claims from those created by other edge controllers + // in the same project control planes. + // + // Required when Mode is "milo". Optional in single mode; defaults to "single". + ClusterName string `json:"clusterName"` + + // QuotaKubeconfigPath is the path to the kubeconfig file used when creating + // ResourceClaim objects against Milo project control planes. When set it + // takes precedence over ProjectKubeconfigPath for quota calls. When both are + // unset, quota accounting is disabled. + // + // Use this field in deployments (mode: single or mode: milo) that need to + // talk to api.datum.net for quota enforcement. + QuotaKubeconfigPath string `json:"quotaKubeconfigPath"` } func SetDefaults_DiscoveryConfig(obj *DiscoveryConfig) { @@ -253,6 +270,36 @@ func (c *DiscoveryConfig) ProjectRestConfig() (*rest.Config, error) { return clientcmd.BuildConfigFromFlags("", c.ProjectKubeconfigPath) } +// QuotaRestConfig returns the REST config for quota ResourceClaim management +// against Milo project control planes. QuotaKubeconfigPath is preferred; if +// unset, ProjectKubeconfigPath is used as a fallback. +// +// Returns (nil, nil) when no credential path is configured at all — this is +// the intentional opt-out case and the caller should disable quota enforcement. +// +// Returns (nil, error) when a credential path IS configured but the file does +// not exist on disk. This is a misconfiguration (Secret not mounted, wrong +// path) that must not silently disable enforcement; callers should treat this +// as a fatal startup error. 
+func (c *DiscoveryConfig) QuotaRestConfig() (*rest.Config, error) { + path := c.QuotaKubeconfigPath + if path == "" { + path = c.ProjectKubeconfigPath + } + if path == "" { + // No credential path configured: intentional opt-out. Caller logs and + // disables enforcement. + return nil, nil + } + if _, err := os.Stat(path); os.IsNotExist(err) { + // Path explicitly configured but file absent: operator intended enforcement + // but the credential is missing (unmounted Secret, wrong path). Fail loud. + return nil, fmt.Errorf("quota kubeconfig path %q is configured but file does not exist: "+ + "ensure the quota credential Secret is mounted correctly", path) + } + return clientcmd.BuildConfigFromFlags("", path) +} + func init() { SchemeBuilder.Register(&WorkloadOperator{}) } diff --git a/internal/config/config_test.go b/internal/config/config_test.go index 5f586932..bff584a6 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -1,6 +1,8 @@ package config import ( + "os" + "path/filepath" "testing" "k8s.io/apimachinery/pkg/runtime" @@ -56,3 +58,68 @@ webhookServer: t.Error("TLS.CertDir was not defaulted") } } + +// TestQuotaRestConfig_NilWhenNoPath verifies that omitting quotaKubeconfigPath +// returns (nil, nil) — the intentional opt-out / enforcement-disabled case. +func TestQuotaRestConfig_NilWhenNoPath(t *testing.T) { + cfg := &DiscoveryConfig{} + restCfg, err := cfg.QuotaRestConfig() + if err != nil { + t.Fatalf("QuotaRestConfig() error = %v, want nil", err) + } + if restCfg != nil { + t.Errorf("QuotaRestConfig() = non-nil, want nil (no path configured)") + } +} + +// TestQuotaRestConfig_ErrorWhenPathMissing verifies that explicitly setting a +// kubeconfig path that does not exist on disk returns a non-nil error (fail-loud). +// This reverses the old da63916 behavior of silently returning (nil, nil). 
+func TestQuotaRestConfig_ErrorWhenPathMissing(t *testing.T) { + cfg := &DiscoveryConfig{ + QuotaKubeconfigPath: "/nonexistent/path/quota.kubeconfig", + } + restCfg, err := cfg.QuotaRestConfig() + if err == nil { + t.Fatal("QuotaRestConfig() error = nil, want non-nil error when path is configured but file absent") + } + if restCfg != nil { + t.Errorf("QuotaRestConfig() returned non-nil config alongside error") + } +} + +// TestQuotaRestConfig_SuccessWhenFileExists verifies that a configured path +// pointing to an existing (though minimal) kubeconfig file succeeds. +func TestQuotaRestConfig_SuccessWhenFileExists(t *testing.T) { + // Write a minimal kubeconfig that clientcmd can parse. + dir := t.TempDir() + kubeconfigPath := filepath.Join(dir, "quota.kubeconfig") + minimalKubeconfig := []byte(`apiVersion: v1 +kind: Config +clusters: +- cluster: + server: https://localhost:1234 + name: test +contexts: +- context: + cluster: test + user: test + name: test +current-context: test +users: +- name: test + user: {} +`) + if err := os.WriteFile(kubeconfigPath, minimalKubeconfig, 0600); err != nil { + t.Fatalf("WriteFile: %v", err) + } + + cfg := &DiscoveryConfig{QuotaKubeconfigPath: kubeconfigPath} + restCfg, err := cfg.QuotaRestConfig() + if err != nil { + t.Fatalf("QuotaRestConfig() error = %v, want nil", err) + } + if restCfg == nil { + t.Error("QuotaRestConfig() = nil, want non-nil when file exists") + } +} diff --git a/internal/controller/indexers.go b/internal/controller/indexers.go index fb0ebe88..7d9e1ae1 100644 --- a/internal/controller/indexers.go +++ b/internal/controller/indexers.go @@ -15,7 +15,10 @@ import ( const ( deploymentWorkloadUIDIndex = "deploymentWorkloadUIDIndex" workloadNetworksIndex = "workloadNetworksIndex" - deploymentLocationIndex = "deploymentLocationIndex" + // deploymentCityCodeIndex indexes WorkloadDeployments by their Spec.CityCode + // so that SubnetClaim/Subnet watches can efficiently find the deployments + // that target the same city 
as a changed networking resource. + deploymentCityCodeIndex = "deploymentCityCodeIndex" ) func AddIndexers(ctx context.Context, mgr mcmanager.Manager) error { @@ -30,9 +33,10 @@ func addWorkloadDeploymentIndexers(ctx context.Context, mgr mcmanager.Manager) e return fmt.Errorf("failed to add workload deployment indexer %q: %w", deploymentWorkloadUIDIndex, err) } - // Index workload deployments by location - if err := mgr.GetFieldIndexer().IndexField(ctx, &computev1alpha.WorkloadDeployment{}, deploymentLocationIndex, deploymentLocationIndexFunc); err != nil { - return fmt.Errorf("failed to add workload deployment indexer %q: %w", deploymentLocationIndex, err) + // Index workload deployments by city code so that SubnetClaim/Subnet watch + // handlers can efficiently find deployments targeting the same city. + if err := mgr.GetFieldIndexer().IndexField(ctx, &computev1alpha.WorkloadDeployment{}, deploymentCityCodeIndex, deploymentCityCodeIndexFunc); err != nil { + return fmt.Errorf("failed to add workload deployment indexer %q: %w", deploymentCityCodeIndex, err) } return nil @@ -44,18 +48,12 @@ func deploymentWorkloadUIDIndexFunc(o client.Object) []string { } } -func deploymentLocationIndexFunc(o client.Object) []string { +func deploymentCityCodeIndexFunc(o client.Object) []string { deployment := o.(*computev1alpha.WorkloadDeployment) - if deployment.Status.Location == nil { + if deployment.Spec.CityCode == "" { return nil } - - return []string{ - types.NamespacedName{ - Namespace: deployment.Status.Location.Namespace, - Name: deployment.Status.Location.Name, - }.String(), - } + return []string{deployment.Spec.CityCode} } func addWorkloadIndexers(ctx context.Context, mgr mcmanager.Manager) error { diff --git a/internal/controller/instance_controller.go b/internal/controller/instance_controller.go index e5bc3564..06120aaa 100644 --- a/internal/controller/instance_controller.go +++ b/internal/controller/instance_controller.go @@ -5,52 +5,163 @@ package controller import ( 
"context" "fmt" + "maps" "strings" corev1 "k8s.io/api/core/v1" + apiequality "k8s.io/apimachinery/pkg/api/equality" apierrors "k8s.io/apimachinery/pkg/api/errors" apimeta "k8s.io/apimachinery/pkg/api/meta" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/rest" + "k8s.io/client-go/tools/record" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/cluster" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + "sigs.k8s.io/controller-runtime/pkg/finalizer" "sigs.k8s.io/controller-runtime/pkg/handler" "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/predicate" "sigs.k8s.io/controller-runtime/pkg/reconcile" - ctrlsource "sigs.k8s.io/controller-runtime/pkg/source" mcbuilder "sigs.k8s.io/multicluster-runtime/pkg/builder" mccontext "sigs.k8s.io/multicluster-runtime/pkg/context" mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager" + "sigs.k8s.io/multicluster-runtime/pkg/multicluster" mcreconcile "sigs.k8s.io/multicluster-runtime/pkg/reconcile" computev1alpha "go.datum.net/compute/api/v1alpha" networkingv1alpha "go.datum.net/network-services-operator/api/v1alpha" quotav1alpha1 "go.miloapis.com/milo/pkg/apis/quota/v1alpha1" + "go.miloapis.com/milo/pkg/downstreamclient" "go.datum.net/compute/internal/controller/instancecontrol" + quotametrics "go.datum.net/compute/internal/quota" ) -const instanceQuotaFinalizer = "quota.compute.datumapis.com/claim-cleanup" +const ( + // instanceQuotaFinalizer ensures the quota ResourceClaim is deleted when + // an Instance is removed. + instanceQuotaFinalizer = "quota.compute.datumapis.com/claim-cleanup" + + // instanceControllerFinalizer is registered with the finalizer framework and + // triggers downstream write-back cleanup on deletion. 
+ instanceControllerFinalizer = "compute.datumapis.com/instance-controller" + + // instanceQuotaClaimSourceLabel is stamped on ResourceClaim objects with the + // name of the edge cluster that created them. The claim watch predicate uses + // this label to filter out claims written by other edge controllers targeting + // the same project control planes. + instanceQuotaClaimSourceLabel = "compute.datumapis.com/source-cluster" + + // quotaResourceTypeInstances is the quota resource type for Instance count. + quotaResourceTypeInstances = "compute.datumapis.com/instances" + + // miloProjectAPIGroup is the API group for Milo resource-manager resources. + miloProjectAPIGroup = "resourcemanager.miloapis.com" + + // miloProjectKind is the Kind used for Milo Project resources. + miloProjectKind = "Project" + + // msgNotProgrammed is the human-readable message for the not-programmed state. + msgNotProgrammed = "Instance has not been programmed" + + // msgInstanceReady is the human-readable message for the ready state. + msgInstanceReady = "Instance is ready" + + // msgInstanceProgrammed is the human-readable message for the programmed state. + msgInstanceProgrammed = "Instance has been programmed" + + // msgInstanceAvailable is the human-readable message for the available state. + msgInstanceAvailable = "Instance is available" + + // reasonNetworkFailedToCreate is the reason code for network creation failure. + reasonNetworkFailedToCreate = "NetworkFailedToCreate" +) + +const ( + instanceAPIGroup = "compute.datumapis.com" + instanceKind = "Instance" + + instanceNotProgrammedMessage = "Instance has not been programmed" + instanceNetworkFailedReason = "NetworkFailedToCreate" + instanceReadyMessage = "Instance is ready" +) // clusterGetter is the subset of mcmanager.Manager used by InstanceReconciler. // Keeping it narrow allows unit tests to substitute a minimal fake. 
type clusterGetter interface { - GetCluster(ctx context.Context, clusterName string) (cluster.Cluster, error) + GetCluster(ctx context.Context, clusterName multicluster.ClusterName) (cluster.Cluster, error) } +// InstanceProjectIDFunc derives the Milo project ID for a given Instance. +// In Milo mode the project ID equals the multicluster ClusterName. In +// single-cell mode it is decoded from the upstream-cluster-name namespace label. +// Returns ("", nil) when the instance has no project affiliation (skip quota). +// Returns ("", err) for transient failures that should trigger a requeue. +type InstanceProjectIDFunc func( + ctx context.Context, + clusterName multicluster.ClusterName, + instance *computev1alpha.Instance, +) (string, error) + +// InstanceProjectNamespaceFunc derives the in-project namespace where +// ResourceClaims for a given Instance should be created. In Milo mode this +// equals instance.Namespace. In single-cell mode it comes from the +// upstream-namespace namespace label. +// Returns ("", nil) when the instance has no project affiliation (skip quota). +// Returns ("", err) for transient failures that should trigger a requeue. +type InstanceProjectNamespaceFunc func( + ctx context.Context, + clusterName multicluster.ClusterName, + instance *computev1alpha.Instance, +) (string, error) + // InstanceReconciler reconciles an Instance object type InstanceReconciler struct { - mgr clusterGetter - managementCluster cluster.Cluster + mgr clusterGetter + scheme *runtime.Scheme + quotaClientManager *quotametrics.ProjectQuotaClientManager + edgeClusterName string + // recorder emits Kubernetes events on the Instance object for quota failure + // modes so operators can diagnose issues via `kubectl describe`. + recorder record.EventRecorder + // projectIDForInstance derives the Milo project ID used for quota + // ResourceClaim management. 
In Milo mode it returns string(clusterName); in + // single-cell mode it reads the upstream-cluster-name label from the edge + // namespace and decodes "cluster-" → "". + projectIDForInstance InstanceProjectIDFunc + // projectNamespaceForInstance derives the in-project namespace where + // ResourceClaims must be created. In Milo mode the ResourceClaim lives in + // instance.Namespace (the project-level namespace); in single-cell mode the + // edge namespace is ns-{uid} which does not exist in the project control + // plane — the real namespace is the upstream-namespace label value (e.g. + // "default"). When nil, falls back to instance.Namespace. + projectNamespaceForInstance InstanceProjectNamespaceFunc + // clusterNameForProject maps a Milo project ID back to the multicluster + // ClusterName that owns that project's workloads. In Milo mode the + // ClusterName equals the project ID. In single-cell mode the only registered + // cluster is "single" regardless of project ID. When nil, falls back to + // multicluster.ClusterName(projectID), which is correct for Milo mode. + clusterNameForProject func(projectID string) multicluster.ClusterName + // FederationClient is an optional client pointing at the upstream + // Karmada/federation control plane (configured via --federation-kubeconfig). + // When non-nil, the reconciler writes a copy of each Instance back to the + // federation control plane so that the InstanceProjector (running in the + // management cluster) can aggregate status across all POP cells. Set to nil to + // disable federation write-back (e.g. in non-federation deployments). 
+ FederationClient client.Client + finalizers finalizer.Finalizers } // +kubebuilder:rbac:groups=compute.datumapis.com,resources=instances,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=compute.datumapis.com,resources=instances/status,verbs=get;update;patch // +kubebuilder:rbac:groups=compute.datumapis.com,resources=instances/finalizers,verbs=update // +kubebuilder:rbac:groups=quota.miloapis.com,resources=resourceclaims,verbs=get;list;watch;create;delete +// +kubebuilder:rbac:groups="",resources=namespaces,verbs=get func (r *InstanceReconciler) Reconcile(ctx context.Context, req mcreconcile.Request) (_ ctrl.Result, err error) { logger := log.FromContext(ctx) @@ -69,29 +180,24 @@ func (r *InstanceReconciler) Reconcile(ctx context.Context, req mcreconcile.Requ return ctrl.Result{}, err } + // Run the finalizer framework first. This handles downstream write-back cleanup + // via the Finalize method registered below. + finalizationResult, err := r.finalizers.Finalize(ctx, &instance) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed to finalize: %w", err) + } + if finalizationResult.Updated { + if err = cl.GetClient().Update(ctx, &instance); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to update based on finalization result: %w", err) + } + return ctrl.Result{}, nil + } + logger.Info("reconciling instance") defer logger.Info("reconcile complete") if !instance.DeletionTimestamp.IsZero() { - if controllerutil.ContainsFinalizer(&instance, instanceQuotaFinalizer) { - claimName := fmt.Sprintf("%s--%s", instance.Namespace, instance.Name) - var claim quotav1alpha1.ResourceClaim - if err := r.managementCluster.GetClient().Get(ctx, client.ObjectKey{Namespace: instance.Namespace, Name: claimName}, &claim); err != nil { - if !apierrors.IsNotFound(err) { - return ctrl.Result{}, fmt.Errorf("failed getting resource claim for deletion: %w", err) - } - } else { - if err := r.managementCluster.GetClient().Delete(ctx, &claim); 
client.IgnoreNotFound(err) != nil { - return ctrl.Result{}, fmt.Errorf("failed deleting resource claim: %w", err) - } - } - - controllerutil.RemoveFinalizer(&instance, instanceQuotaFinalizer) - if err := cl.GetClient().Update(ctx, &instance); err != nil { - return ctrl.Result{}, fmt.Errorf("failed removing quota finalizer: %w", err) - } - } - return ctrl.Result{}, nil + return ctrl.Result{}, r.reconcileDeletion(ctx, cl.GetClient(), req.ClusterName, &instance) } if !controllerutil.ContainsFinalizer(&instance, instanceQuotaFinalizer) { @@ -102,94 +208,439 @@ func (r *InstanceReconciler) Reconcile(ctx context.Context, req mcreconcile.Requ return ctrl.Result{}, nil } - grantedCondition, err := r.reconcileQuotaClaim(ctx, req.ClusterName, &instance) + statusChanged, quotaErr := r.reconcileQuotaCondition(ctx, req.ClusterName, &instance) + + // Even when reconcileQuotaCondition returns a transient error, persist any + // condition change first so the failure reason is visible on the Instance. + // We return the error afterwards so controller-runtime requeues with backoff. + readyChanged, err := r.reconcileInstanceReadyCondition(ctx, cl.GetClient(), &instance, r.checkForNetworkCreationFailure) if err != nil { - return ctrl.Result{}, fmt.Errorf("failed reconciling quota claim: %w", err) + return ctrl.Result{}, err + } + + if statusChanged || readyChanged { + if err := cl.GetClient().Status().Update(ctx, &instance); err != nil { + return ctrl.Result{}, err + } + // Return with the quota error (nil or transient) so controller-runtime + // requeues with backoff on failures. On the success path (quotaErr==nil) + // we fall through to removeQuotaSchedulingGate below instead of returning + // early, so the gate is cleared in the same reconcile pass rather than + // waiting for a requeue that may never come (ResourceClaim is immutable + // and local Instances are not watched). 
+ if quotaErr != nil { + if err := r.writeBackToUpstream(ctx, req.ClusterName, &instance); err != nil { + return ctrl.Result{}, err + } + return ctrl.Result{}, quotaErr + } + } else if quotaErr != nil { + // No status change but quota evaluation failed — return error to requeue. + return ctrl.Result{}, quotaErr + } + + if err := r.removeQuotaSchedulingGate(ctx, cl.GetClient(), &instance); err != nil { + return ctrl.Result{}, err + } + + if err := r.writeBackToUpstream(ctx, req.ClusterName, &instance); err != nil { + return ctrl.Result{}, err + } + + return ctrl.Result{}, nil +} + +// reconcileDeletion handles quota-claim cleanup when an Instance is being +// deleted. It removes the quota finalizer once the ResourceClaim is gone. +func (r *InstanceReconciler) reconcileDeletion(ctx context.Context, cl client.Client, clusterName multicluster.ClusterName, instance *computev1alpha.Instance) error { + if !controllerutil.ContainsFinalizer(instance, instanceQuotaFinalizer) { + return nil + } + + if r.quotaClientManager != nil { + projectID, err := r.resolveProjectID(ctx, clusterName, instance) + if err != nil { + return fmt.Errorf("resolving project ID during deletion: %w", err) + } + if projectID == "" { + // Cannot locate the claim without a project ID. Log at ERROR and emit an + // event so the operator is aware of the orphaned claim. Fall through to + // finalizer removal so the Instance is not permanently stuck in Terminating. + // The orphaned claim will count against project budget until Milo's TTL/GC + // removes it. 
+ log.FromContext(ctx).Error(nil, "project ID unresolvable during deletion; ResourceClaim may be orphaned — budget leak possible", + "instance", instance.Name, "namespace", instance.Namespace) + r.recorder.Event(instance, corev1.EventTypeWarning, + "QuotaClaimOrphaned", + "Skipping ResourceClaim cleanup: project ID could not be resolved; claim may be orphaned in Milo project control plane") + quotametrics.ClaimOrphanedTotal.Inc() + } else { + projectClient, err := r.quotaClientManager.ClientForProject(ctx, projectID, r.scheme) + if err != nil { + return fmt.Errorf("failed getting quota client for deletion: %w", err) + } + + claimNamespace, err := r.resolveProjectNamespace(ctx, clusterName, instance) + if err != nil { + return fmt.Errorf("resolving project namespace during deletion: %w", err) + } + claimName := fmt.Sprintf("%s--%s", instance.Namespace, instance.Name) + var claim quotav1alpha1.ResourceClaim + if err := projectClient.Get(ctx, client.ObjectKey{Namespace: claimNamespace, Name: claimName}, &claim); err != nil { + if !apierrors.IsNotFound(err) { + return fmt.Errorf("failed getting resource claim for deletion: %w", err) + } + } else { + if err := projectClient.Delete(ctx, &claim); client.IgnoreNotFound(err) != nil { + return fmt.Errorf("failed deleting resource claim: %w", err) + } + } + } } - statusChanged := false + controllerutil.RemoveFinalizer(instance, instanceQuotaFinalizer) + if err := cl.Update(ctx, instance); err != nil { + return fmt.Errorf("failed removing quota finalizer: %w", err) + } + return nil +} +// reconcileQuotaCondition reconciles the ResourceClaim and updates the +// InstanceQuotaGranted status condition. It returns (changed, err) where +// changed=true means a status update is required, and err non-nil means the +// reconciler should requeue (with backoff) in addition to writing the condition. 
+func (r *InstanceReconciler) reconcileQuotaCondition(ctx context.Context, clusterName multicluster.ClusterName, instance *computev1alpha.Instance) (bool, error) { + grantedCondition, claimErr := r.reconcileQuotaClaim(ctx, clusterName, instance) + + // reconcileQuotaClaim returns (condition, err). A non-nil error signals a + // transient infrastructure failure; a non-nil condition carries the reason to + // write. Both can be non-nil: write the condition AND requeue with backoff. switch { - case grantedCondition == nil || (grantedCondition.Status == metav1.ConditionFalse && grantedCondition.Reason == quotav1alpha1.ResourceClaimPendingReason): - statusChanged = apimeta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{ + case grantedCondition == nil && claimErr == nil: + // No claim yet and no error: labels not yet propagated. Stay PendingEvaluation. + return apimeta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{ Type: computev1alpha.InstanceQuotaGranted, Status: metav1.ConditionUnknown, Reason: computev1alpha.InstanceQuotaGrantedReasonPendingEvaluation, Message: "Waiting for quota evaluation", ObservedGeneration: instance.Generation, + }), nil + + case grantedCondition != nil && grantedCondition.Status == metav1.ConditionFalse && + grantedCondition.Reason == quotav1alpha1.ResourceClaimPendingReason: + // Claim exists but pending — no AllowanceBucket. Distinct from "evaluating". 
+ changed := apimeta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{ + Type: computev1alpha.InstanceQuotaGranted, + Status: metav1.ConditionUnknown, + Reason: computev1alpha.InstanceQuotaGrantedReasonNoBudget, + Message: "ResourceClaim is pending: no AllowanceBucket configured for this project", + ObservedGeneration: instance.Generation, }) + r.recorder.Event(instance, corev1.EventTypeWarning, + computev1alpha.InstanceQuotaGrantedReasonNoBudget, + "ResourceClaim pending: no AllowanceBucket configured for this project") + quotametrics.EvalFailuresTotal.WithLabelValues(quotametrics.ReasonNoBudget).Inc() + return changed, claimErr - case grantedCondition.Status == metav1.ConditionTrue: - statusChanged = apimeta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{ + case grantedCondition != nil && grantedCondition.Type == computev1alpha.InstanceQuotaGranted: + // reconcileQuotaClaim populated a structured failure condition. + changed := apimeta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{ + Type: computev1alpha.InstanceQuotaGranted, + Status: grantedCondition.Status, + Reason: grantedCondition.Reason, + Message: grantedCondition.Message, + ObservedGeneration: instance.Generation, + }) + return changed, claimErr + + case grantedCondition != nil && grantedCondition.Status == metav1.ConditionTrue: + return apimeta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{ Type: computev1alpha.InstanceQuotaGranted, Status: metav1.ConditionTrue, Reason: computev1alpha.InstanceQuotaGrantedReasonQuotaAvailable, Message: grantedCondition.Message, ObservedGeneration: instance.Generation, - }) + }), claimErr - case grantedCondition.Status == metav1.ConditionFalse: + case grantedCondition != nil: // False, non-pending reason from ResourceClaim reason := computev1alpha.InstanceQuotaGrantedReasonQuotaExceeded if grantedCondition.Reason == quotav1alpha1.ResourceClaimValidationFailedReason { reason = 
computev1alpha.InstanceQuotaGrantedReasonValidationFailed } - statusChanged = apimeta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{ + return apimeta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{ Type: computev1alpha.InstanceQuotaGranted, Status: metav1.ConditionFalse, Reason: reason, Message: grantedCondition.Message, ObservedGeneration: instance.Generation, - }) + }), claimErr + + default: // grantedCondition == nil && claimErr != nil — should not reach here + return false, claimErr } +} - readyChanged, err := r.reconcileInstanceReadyCondition(ctx, cl.GetClient(), &instance, r.checkForNetworkCreationFailure) +// removeQuotaSchedulingGate removes the quota scheduling gate from the +// Instance spec once QuotaGranted=True has been persisted to status. +// It guards on ObservedGeneration to prevent a stale True condition from +// generation N unblocking a generation N+1 instance before quota for the +// new spec has been evaluated. +func (r *InstanceReconciler) removeQuotaSchedulingGate(ctx context.Context, cl client.Client, instance *computev1alpha.Instance) error { + quotaGrantedCond := apimeta.FindStatusCondition(instance.Status.Conditions, computev1alpha.InstanceQuotaGranted) + if quotaGrantedCond == nil || quotaGrantedCond.Status != metav1.ConditionTrue { + return nil + } + // Stale condition guard: only remove the gate if the condition reflects the + // current spec generation. A condition from an older generation means quota + // has not yet been evaluated for the current spec. 
+ if quotaGrantedCond.ObservedGeneration != instance.Generation { + return nil + } + if instance.Spec.Controller == nil { + return nil + } + + newGates := make([]computev1alpha.SchedulingGate, 0, len(instance.Spec.Controller.SchedulingGates)) + gateRemoved := false + for _, gate := range instance.Spec.Controller.SchedulingGates { + if gate.Name == instancecontrol.QuotaSchedulingGate.String() { + gateRemoved = true + continue + } + newGates = append(newGates, gate) + } + if !gateRemoved { + return nil + } + + patch := client.MergeFrom(instance.DeepCopy()) + instance.Spec.Controller.SchedulingGates = newGates + if err := cl.Patch(ctx, instance, patch); err != nil { + return fmt.Errorf("failed patching quota scheduling gate: %w", err) + } + return nil +} + +// Finalize removes the downstream write-back Instance when the local Instance is +// deleted. It is a no-op when downstream federation is disabled. +func (r *InstanceReconciler) Finalize(ctx context.Context, obj client.Object) (finalizer.Result, error) { + if r.FederationClient == nil { + return finalizer.Result{}, nil + } + + instance := obj.(*computev1alpha.Instance) + + downstreamInstance := &computev1alpha.Instance{} + err := r.FederationClient.Get(ctx, client.ObjectKeyFromObject(instance), downstreamInstance) + if apierrors.IsNotFound(err) { + // Already gone — nothing to do. 
+ return finalizer.Result{}, nil + } if err != nil { - return ctrl.Result{}, err + return finalizer.Result{}, fmt.Errorf("failed getting downstream instance for deletion: %w", err) } - if statusChanged || readyChanged { - if err := cl.GetClient().Status().Update(ctx, &instance); err != nil { - return ctrl.Result{}, err + if err := r.FederationClient.Delete(ctx, downstreamInstance); client.IgnoreNotFound(err) != nil { + return finalizer.Result{}, fmt.Errorf("failed deleting downstream write-back instance: %w", err) + } + + return finalizer.Result{}, nil +} + +// writeBackToUpstream copies the Instance spec and status to the upstream +// Karmada/federation control plane so that the InstanceProjector can aggregate +// state from all POP cells. It is a no-op when FederationClient is nil (federation disabled). +func (r *InstanceReconciler) writeBackToUpstream(ctx context.Context, clusterName multicluster.ClusterName, instance *computev1alpha.Instance) error { + if r.FederationClient == nil { + return nil + } + + // Encode the POP-cell cluster name using the same convention as NSO's + // MappedNamespaceResourceStrategy: "cluster-" with "/" → "_". + // This is the fallback; the namespace label takes precedence when present. + encodedClusterName := "cluster-" + strings.ReplaceAll(string(clusterName), "/", "_") + + // Read the upstream project namespace name and cluster name from the namespace + // labels stamped by NSO's MappedNamespaceResourceStrategy. These carry the true + // project cluster name (e.g. "cluster-datum-cloud") and upstream namespace (e.g. + // "default"), which the InstanceProjector needs to find the right project cluster. 
+ upstreamNamespace := instance.Namespace // fallback: cell namespace (ns-) + var downstreamNS corev1.Namespace + if err := r.FederationClient.Get(ctx, client.ObjectKey{Name: instance.Namespace}, &downstreamNS); err == nil { + if v := downstreamNS.Labels[downstreamclient.UpstreamOwnerNamespaceLabel]; v != "" { + upstreamNamespace = v + } + if v := downstreamNS.Labels[downstreamclient.UpstreamOwnerClusterNameLabel]; v != "" { + encodedClusterName = v } - // Return after the status update so that the next reconcile sees the - // updated QuotaGranted condition before attempting spec changes. - return ctrl.Result{}, nil } - // Remove the quota scheduling gate once QuotaGranted=True is persisted. - quotaGrantedCond := apimeta.FindStatusCondition(instance.Status.Conditions, computev1alpha.InstanceQuotaGranted) - if quotaGrantedCond != nil && quotaGrantedCond.Status == metav1.ConditionTrue { - if instance.Spec.Controller != nil { - newGates := make([]computev1alpha.SchedulingGate, 0, len(instance.Spec.Controller.SchedulingGates)) - gateRemoved := false - for _, gate := range instance.Spec.Controller.SchedulingGates { - if gate.Name == instancecontrol.QuotaSchedulingGate.String() { - gateRemoved = true - continue - } - newGates = append(newGates, gate) - } - if gateRemoved { - patch := client.MergeFrom(instance.DeepCopy()) - instance.Spec.Controller.SchedulingGates = newGates - if err := cl.GetClient().Patch(ctx, &instance, patch); err != nil { - return ctrl.Result{}, fmt.Errorf("failed patching quota scheduling gate: %w", err) - } - } + logger := log.FromContext(ctx) + missingLabels := []string{} + for _, key := range []string{ + computev1alpha.WorkloadUIDLabel, + computev1alpha.WorkloadDeploymentUIDLabel, + computev1alpha.InstanceIndexLabel, + } { + if instance.Labels[key] == "" { + missingLabels = append(missingLabels, key) } } + if len(missingLabels) > 0 { + logger.Info("instance is missing linking labels for write-back; projection owner-ref will not be set", + 
"instance", instance.Name, "namespace", instance.Namespace, + "missingLabels", missingLabels) + } - return ctrl.Result{}, nil + writeBack := &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: instance.Name, + Namespace: instance.Namespace, + Labels: map[string]string{ + downstreamclient.UpstreamOwnerClusterNameLabel: encodedClusterName, + downstreamclient.UpstreamOwnerNamespaceLabel: upstreamNamespace, + computev1alpha.WorkloadUIDLabel: instance.Labels[computev1alpha.WorkloadUIDLabel], + computev1alpha.WorkloadDeploymentUIDLabel: instance.Labels[computev1alpha.WorkloadDeploymentUIDLabel], + computev1alpha.InstanceIndexLabel: instance.Labels[computev1alpha.InstanceIndexLabel], + computev1alpha.WorkloadDeploymentNameLabel: instance.Labels[computev1alpha.WorkloadDeploymentNameLabel], + computev1alpha.CityCodeLabel: instance.Labels[computev1alpha.CityCodeLabel], + computev1alpha.WorkloadNameLabel: instance.Labels[computev1alpha.WorkloadNameLabel], + computev1alpha.PlacementNameLabel: instance.Labels[computev1alpha.PlacementNameLabel], + }, + }, + Spec: instance.Spec, + } + + existing := &computev1alpha.Instance{} + err := r.FederationClient.Get(ctx, client.ObjectKeyFromObject(writeBack), existing) + if apierrors.IsNotFound(err) { + // Ensure the namespace exists in the downstream control plane before creating the Instance. 
+ ns := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: instance.Namespace}} + if err := r.FederationClient.Create(ctx, ns); err != nil && !apierrors.IsAlreadyExists(err) { + return fmt.Errorf("failed ensuring downstream namespace: %w", err) + } + if err := r.FederationClient.Create(ctx, writeBack); err != nil { + return fmt.Errorf("failed creating downstream write-back instance: %w", err) + } + writeBack.Status = instance.Status + if err := r.FederationClient.Status().Update(ctx, writeBack); err != nil { + return fmt.Errorf("failed updating downstream write-back instance status after create: %w", err) + } + return nil + } + if err != nil { + return fmt.Errorf("failed getting downstream instance: %w", err) + } + + // Build a comparable map containing only the keys this function owns so that + // Karmada-managed labels on the existing object do not cause spurious updates. + ownedLabels := make(map[string]string, len(writeBack.Labels)) + for k := range writeBack.Labels { + ownedLabels[k] = existing.Labels[k] + } + + // Update spec + labels only if owned keys differ. + if !apiequality.Semantic.DeepEqual(existing.Spec, instance.Spec) || + !apiequality.Semantic.DeepEqual(ownedLabels, writeBack.Labels) { + existing.Spec = instance.Spec + // Merge writeBack.Labels into existing.Labels. Only keys owned by + // writeBackToUpstream are written; any labels Karmada or other actors + // have placed on the downstream object are preserved. + if existing.Labels == nil { + existing.Labels = make(map[string]string) + } + maps.Copy(existing.Labels, writeBack.Labels) + if err := r.FederationClient.Update(ctx, existing); err != nil { + return fmt.Errorf("failed updating downstream write-back instance: %w", err) + } + } + + // Update status only if it differs. 
+ if !apiequality.Semantic.DeepEqual(existing.Status, instance.Status) { + existing.Status = instance.Status + if err := r.FederationClient.Status().Update(ctx, existing); err != nil { + return fmt.Errorf("failed updating downstream write-back instance status: %w", err) + } + } + + return nil } -func (r *InstanceReconciler) reconcileQuotaClaim(ctx context.Context, clusterName string, instance *computev1alpha.Instance) (*metav1.Condition, error) { +// reconcileQuotaClaim attempts to create or observe a ResourceClaim for the +// given instance. It returns: +// - (nil, nil) — labels not yet propagated; caller sets PendingEvaluation +// - (condition, nil) — terminal condition (True/False/Unknown from claim or failure) +// - (condition, err) — condition to write + transient error to requeue with backoff +// +// The condition's Type field is always InstanceQuotaGranted when set by this function +// to distinguish it from ResourceClaim conditions returned directly. +func (r *InstanceReconciler) reconcileQuotaClaim(ctx context.Context, clusterName multicluster.ClusterName, instance *computev1alpha.Instance) (*metav1.Condition, error) { + if r.quotaClientManager == nil { + return &metav1.Condition{ + Type: computev1alpha.InstanceQuotaGranted, + Status: metav1.ConditionTrue, + Reason: computev1alpha.InstanceQuotaGrantedReasonQuotaDisabled, + Message: "Quota enforcement disabled: no credential configured", + }, nil + } + logger := log.FromContext(ctx) + projectID, err := r.resolveProjectID(ctx, clusterName, instance) + if err != nil { + // Transient: namespace API unreachable. Return structured condition + error. 
+ msg := fmt.Sprintf("Could not resolve project ID: %v", err) + r.recorder.Event(instance, corev1.EventTypeWarning, + computev1alpha.InstanceQuotaGrantedReasonProjectIDUnresolvable, msg) + quotametrics.EvalFailuresTotal.WithLabelValues(quotametrics.ReasonProjectIDUnresolvable).Inc() + return &metav1.Condition{ + Type: computev1alpha.InstanceQuotaGranted, + Status: metav1.ConditionFalse, + Reason: computev1alpha.InstanceQuotaGrantedReasonProjectIDUnresolvable, + Message: msg, + }, fmt.Errorf("resolving project ID for instance %s/%s: %w", instance.Namespace, instance.Name, err) + } + if projectID == "" { + // Labels not yet propagated — bootstrap transient, not an error. + return nil, nil + } + + projectClient, err := r.quotaClientManager.ClientForProject(ctx, projectID, r.scheme) + if err != nil { + msg := fmt.Sprintf("Failed to build quota client for project %q: %v", projectID, err) + r.recorder.Event(instance, corev1.EventTypeWarning, + computev1alpha.InstanceQuotaGrantedReasonBackendUnavailable, msg) + quotametrics.EvalFailuresTotal.WithLabelValues(quotametrics.ReasonBackendUnavailable).Inc() + return &metav1.Condition{ + Type: computev1alpha.InstanceQuotaGranted, + Status: metav1.ConditionFalse, + Reason: computev1alpha.InstanceQuotaGrantedReasonBackendUnavailable, + Message: msg, + }, fmt.Errorf("failed getting quota client for project %q: %w", projectID, err) + } + + claimNamespace, err := r.resolveProjectNamespace(ctx, clusterName, instance) + if err != nil { + msg := fmt.Sprintf("Could not resolve project namespace: %v", err) + r.recorder.Event(instance, corev1.EventTypeWarning, + computev1alpha.InstanceQuotaGrantedReasonProjectIDUnresolvable, msg) + quotametrics.EvalFailuresTotal.WithLabelValues(quotametrics.ReasonProjectIDUnresolvable).Inc() + return &metav1.Condition{ + Type: computev1alpha.InstanceQuotaGranted, + Status: metav1.ConditionFalse, + Reason: computev1alpha.InstanceQuotaGrantedReasonProjectIDUnresolvable, + Message: msg, + }, 
fmt.Errorf("resolving project namespace for instance %s/%s: %w", instance.Namespace, instance.Name, err) + } + if claimNamespace == "" { + return nil, nil + } + claimName := fmt.Sprintf("%s--%s", instance.Namespace, instance.Name) requests := []quotav1alpha1.ResourceRequest{ { - ResourceType: "compute.datumapis.com/instances", + ResourceType: quotaResourceTypeInstances, Amount: 1, }, } @@ -213,39 +664,99 @@ func (r *InstanceReconciler) reconcileQuotaClaim(ctx context.Context, clusterNam desired := &quotav1alpha1.ResourceClaim{ ObjectMeta: metav1.ObjectMeta{ Name: claimName, - Namespace: instance.Namespace, + Namespace: claimNamespace, + Labels: map[string]string{ + instanceQuotaClaimSourceLabel: r.edgeClusterName, + }, }, Spec: quotav1alpha1.ResourceClaimSpec{ ConsumerRef: quotav1alpha1.ConsumerRef{ - APIGroup: "resourcemanager.miloapis.com", - Kind: "Project", - Name: clusterName, + APIGroup: miloProjectAPIGroup, + Kind: miloProjectKind, + Name: projectID, }, ResourceRef: quotav1alpha1.UnversionedObjectReference{ - APIGroup: "compute.datumapis.com", - Kind: "Instance", - Name: instance.Name, - Namespace: instance.Namespace, + APIGroup: miloProjectAPIGroup, + Kind: miloProjectKind, + Name: projectID, }, Requests: requests, }, } var existing quotav1alpha1.ResourceClaim - if err := r.managementCluster.GetClient().Get(ctx, client.ObjectKey{Namespace: desired.Namespace, Name: desired.Name}, &existing); err != nil { - if !apierrors.IsNotFound(err) { - return nil, fmt.Errorf("failed getting resource claim: %w", err) - } - if err := r.managementCluster.GetClient().Create(ctx, desired); err != nil { - return nil, fmt.Errorf("failed creating resource claim: %w", err) + if err := projectClient.Get(ctx, client.ObjectKey{Namespace: desired.Namespace, Name: desired.Name}, &existing); err != nil { + if apierrors.IsNotFound(err) { + // Claim doesn't exist yet — attempt to create it. 
+ createErr := projectClient.Create(ctx, desired) + if createErr == nil { + return nil, nil + } + return r.classifyCreateError(instance, projectID, claimNamespace, createErr) } - return nil, nil + // GET itself failed — treat as backend unavailable. + msg := fmt.Sprintf("Quota backend unreachable getting ResourceClaim: %v", err) + r.recorder.Event(instance, corev1.EventTypeWarning, + computev1alpha.InstanceQuotaGrantedReasonBackendUnavailable, msg) + quotametrics.EvalFailuresTotal.WithLabelValues(quotametrics.ReasonBackendUnavailable).Inc() + return &metav1.Condition{ + Type: computev1alpha.InstanceQuotaGranted, + Status: metav1.ConditionFalse, + Reason: computev1alpha.InstanceQuotaGrantedReasonBackendUnavailable, + Message: msg, + }, fmt.Errorf("failed getting resource claim: %w", err) } grantedCondition := apimeta.FindStatusCondition(existing.Status.Conditions, quotav1alpha1.ResourceClaimGranted) return grantedCondition, nil } +// classifyCreateError maps a ResourceClaim creation error to a structured +// QuotaGranted condition with a specific reason, emits a Kubernetes event, and +// increments the appropriate metric counter. +func (r *InstanceReconciler) classifyCreateError( + instance *computev1alpha.Instance, + projectID, claimNamespace string, + err error, +) (*metav1.Condition, error) { + var reason, metricLabel, msg string + + switch { + case apierrors.IsNotFound(err): + // 404 on Create: either the project control plane path doesn't exist + // (project deleted) or the namespace doesn't exist yet. + if claimNamespace != "" { + // Namespace-level 404. 
+ reason = computev1alpha.InstanceQuotaGrantedReasonNamespaceNotFound + metricLabel = quotametrics.ReasonNamespaceNotFound + msg = fmt.Sprintf("Quota claim namespace %q not found on project %q control plane", claimNamespace, projectID) + } else { + reason = computev1alpha.InstanceQuotaGrantedReasonProjectNotFound + metricLabel = quotametrics.ReasonProjectNotFound + msg = fmt.Sprintf("Milo project %q not found", projectID) + } + case apierrors.IsForbidden(err) || apierrors.IsInvalid(err): + // 403/422: quota admission plugin rejected the claim. + reason = computev1alpha.InstanceQuotaGrantedReasonMisconfigured + metricLabel = quotametrics.ReasonMisconfigured + msg = fmt.Sprintf("Quota admission rejected ResourceClaim for project %q: %v", projectID, err) + default: + // Connectivity or server error — treat as backend unavailable. + reason = computev1alpha.InstanceQuotaGrantedReasonBackendUnavailable + metricLabel = quotametrics.ReasonBackendUnavailable + msg = fmt.Sprintf("Quota backend unreachable creating ResourceClaim: %v", err) + } + + r.recorder.Event(instance, corev1.EventTypeWarning, reason, msg) + quotametrics.EvalFailuresTotal.WithLabelValues(metricLabel).Inc() + return &metav1.Condition{ + Type: computev1alpha.InstanceQuotaGranted, + Status: metav1.ConditionFalse, + Reason: reason, + Message: msg, + }, fmt.Errorf("failed creating resource claim: %w", err) +} + func resolveInstanceResources(instance *computev1alpha.Instance) (cpuMillicores int64, memMiB int64, resolved bool) { rt := instance.Spec.Runtime if rt.Sandbox != nil { @@ -304,7 +815,7 @@ func (r *InstanceReconciler) reconcileInstanceReadyCondition( ObservedGeneration: instance.Generation, }) changed = apimeta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{ - Type: computev1alpha.InstanceRunning, + Type: computev1alpha.InstanceAvailable, Status: metav1.ConditionFalse, Reason: computev1alpha.InstanceProgrammedReasonPendingQuota, Message: msg, @@ -327,7 +838,7 @@ func (r 
*InstanceReconciler) reconcileInstanceReadyCondition( Status: metav1.ConditionFalse, Reason: computev1alpha.InstanceProgrammedReasonPendingProgramming, ObservedGeneration: instance.Generation, - Message: "Instance has not been programmed", + Message: msgNotProgrammed, } } else { readyCondition = readyCondition.DeepCopy() @@ -344,8 +855,9 @@ func (r *InstanceReconciler) reconcileInstanceReadyCondition( return false, fmt.Errorf("failed checking for network creation failure: %w", err) } + readyCondition.Status = metav1.ConditionFalse if networkCreationFailure { - readyCondition.Reason = "NetworkFailedToCreate" + readyCondition.Reason = reasonNetworkFailedToCreate readyCondition.Message = networkCreationFailureMessage } else { readyCondition.Reason = computev1alpha.InstanceReadyReasonSchedulingGatesPresent @@ -360,12 +872,13 @@ func (r *InstanceReconciler) reconcileInstanceReadyCondition( if programmedCondition == nil || programmedCondition.Status != metav1.ConditionTrue { logger.Info("instance is not programmed", "instance", instance.Name) + readyCondition.Status = metav1.ConditionFalse readyCondition.Reason = computev1alpha.InstanceProgrammedReasonPendingProgramming if programmedCondition != nil && programmedCondition.Reason != pendingReason { readyCondition.Reason = programmedCondition.Reason } - readyCondition.Message = "Instance has not been programmed" + readyCondition.Message = msgNotProgrammed if programmedCondition != nil && programmedCondition.Status != metav1.ConditionUnknown { readyCondition.Message = programmedCondition.Message } @@ -375,26 +888,27 @@ func (r *InstanceReconciler) reconcileInstanceReadyCondition( logger.Info("instance is programmed", "instance", instance.Name) - runningCondition := apimeta.FindStatusCondition(instance.Status.Conditions, computev1alpha.InstanceRunning) - if runningCondition == nil || runningCondition.Status != metav1.ConditionTrue { - logger.Info("instance is not running", "instance", instance.Name) + availableCondition := 
apimeta.FindStatusCondition(instance.Status.Conditions, computev1alpha.InstanceAvailable) + if availableCondition == nil || availableCondition.Status != metav1.ConditionTrue { + logger.Info("instance is not available", "instance", instance.Name) + readyCondition.Status = metav1.ConditionFalse readyCondition.Reason = pendingReason - if runningCondition != nil && runningCondition.Reason != pendingReason { - readyCondition.Reason = runningCondition.Reason + if availableCondition != nil && availableCondition.Reason != pendingReason { + readyCondition.Reason = availableCondition.Reason } - readyCondition.Message = "Instance is not running" - if runningCondition != nil && runningCondition.Status != metav1.ConditionUnknown { - readyCondition.Message = runningCondition.Message + readyCondition.Message = "Instance is not available" + if availableCondition != nil && availableCondition.Status != metav1.ConditionUnknown { + readyCondition.Message = availableCondition.Message } return apimeta.SetStatusCondition(&instance.Status.Conditions, *readyCondition), nil } readyCondition.Status = metav1.ConditionTrue - readyCondition.Reason = computev1alpha.InstanceReadyReasonRunning - readyCondition.Message = "Instance is ready" + readyCondition.Reason = computev1alpha.InstanceReadyReasonAvailable + readyCondition.Message = msgInstanceReady return apimeta.SetStatusCondition(&instance.Status.Conditions, *readyCondition), nil } @@ -428,7 +942,7 @@ func (r *InstanceReconciler) checkForNetworkCreationFailure(ctx context.Context, } condition := apimeta.FindStatusCondition(networkBinding.Status.Conditions, networkingv1alpha.NetworkBindingReady) - if condition != nil && condition.Status == metav1.ConditionFalse && condition.Reason == "NetworkFailedToCreate" { + if condition != nil && condition.Status == metav1.ConditionFalse && condition.Reason == instanceNetworkFailedReason { return true, condition.Message, nil } } @@ -436,38 +950,118 @@ func (r *InstanceReconciler) 
checkForNetworkCreationFailure(ctx context.Context, return false, "", nil } +// resolveProjectID returns the Milo project ID to use for quota calls. +// When projectIDForInstance is set it delegates to that function; otherwise it +// falls back to string(clusterName), which is correct for Milo-mode deployments +// where the cluster name IS the project name. +// Returns ("", nil) to signal "no project, skip quota". Returns ("", err) for +// transient failures that should cause a reconcile requeue. +func (r *InstanceReconciler) resolveProjectID(ctx context.Context, clusterName multicluster.ClusterName, instance *computev1alpha.Instance) (string, error) { + if r.projectIDForInstance != nil { + return r.projectIDForInstance(ctx, clusterName, instance) + } + return string(clusterName), nil +} + +// resolveProjectNamespace returns the namespace within the Milo project control +// plane where ResourceClaims for this instance should be created. +// When projectNamespaceForInstance is set it delegates to that function; +// otherwise it falls back to instance.Namespace, which is correct for +// Milo-mode deployments where the project-side namespace already matches the +// instance namespace. +// Returns ("", nil) to signal "no project, skip quota". Returns ("", err) for +// transient failures that should cause a reconcile requeue. +func (r *InstanceReconciler) resolveProjectNamespace(ctx context.Context, clusterName multicluster.ClusterName, instance *computev1alpha.Instance) (string, error) { + if r.projectNamespaceForInstance != nil { + return r.projectNamespaceForInstance(ctx, clusterName, instance) + } + return instance.Namespace, nil +} + +// resolveClusterNameForProject returns the multicluster ClusterName for the +// given project ID. When clusterNameForProject is set it delegates to that +// function; otherwise it falls back to multicluster.ClusterName(projectID), +// which is correct for Milo-mode deployments where the cluster name IS the +// project name. 
+func (r *InstanceReconciler) resolveClusterNameForProject(projectID string) multicluster.ClusterName { + if r.clusterNameForProject != nil { + return r.clusterNameForProject(projectID) + } + return multicluster.ClusterName(projectID) +} + // SetupWithManager sets up the controller with the Manager. -func (r *InstanceReconciler) SetupWithManager(mgr mcmanager.Manager, managementCluster cluster.Cluster) error { +// +// quotaRestConfig is the REST config used to reach Milo project control planes +// for ResourceClaim management. Pass nil to disable quota accounting. +// +// projectIDForInstance derives the Milo project ID for each reconcile request. +// In Milo mode pass nil (falls back to using ClusterName). In single-cell mode +// pass a function that returns instance.Namespace. +// +// clusterNameForProject maps a project ID back to the multicluster ClusterName. +// In Milo mode pass nil (falls back to ClusterName(projectID)). In single-cell +// mode pass a function that always returns "single". +func (r *InstanceReconciler) SetupWithManager( + mgr mcmanager.Manager, + quotaRestConfig *rest.Config, + projectIDForInstance InstanceProjectIDFunc, + projectNamespaceForInstance InstanceProjectNamespaceFunc, + edgeClusterName string, + clusterNameForProject func(projectID string) multicluster.ClusterName, +) error { r.mgr = mgr - r.managementCluster = managementCluster - - // Watch ResourceClaim objects on the management cluster directly, bypassing - // the multicluster clusterInjectingQueue which would overwrite ClusterName. - // Using ctrlsource.TypedKind lets the handler produce mcreconcile.Request - // values with the correct ClusterName taken from claim.Spec.ConsumerRef.Name. 
- claimSource := ctrlsource.TypedKind( - managementCluster.GetCache(), - &quotav1alpha1.ResourceClaim{}, - handler.TypedEnqueueRequestsFromMapFunc(func(ctx context.Context, claim *quotav1alpha1.ResourceClaim) []mcreconcile.Request { - if claim.Spec.ResourceRef.Kind != "Instance" || claim.Spec.ResourceRef.APIGroup != "compute.datumapis.com" { - return nil - } - return []mcreconcile.Request{ - { - Request: reconcile.Request{ - NamespacedName: types.NamespacedName{ - Name: claim.Spec.ResourceRef.Name, - Namespace: claim.Spec.ResourceRef.Namespace, - }, - }, - ClusterName: claim.Spec.ConsumerRef.Name, - }, - } - }), - ) + r.scheme = mgr.GetLocalManager().GetScheme() + //nolint:staticcheck // GetEventRecorder (new events API) has an incompatible Eventf + // signature (requires related object + action args) that would require migrating + // all emit sites. GetEventRecorderFor remains correct; migration is deferred. + r.recorder = mgr.GetLocalManager().GetEventRecorderFor("instance-controller") + r.edgeClusterName = edgeClusterName + r.projectIDForInstance = projectIDForInstance + r.projectNamespaceForInstance = projectNamespaceForInstance + r.clusterNameForProject = clusterNameForProject + if quotaRestConfig != nil { + if edgeClusterName == "" { + return fmt.Errorf("edgeClusterName must be set when quota enforcement is enabled; set discovery.clusterName in the server config") + } + r.quotaClientManager = quotametrics.New(quotaRestConfig) + } + + r.finalizers = finalizer.NewFinalizers() + if err := r.finalizers.Register(instanceControllerFinalizer, r); err != nil { + return fmt.Errorf("failed to register finalizer: %w", err) + } + + edgeClusterNameVal := r.edgeClusterName return mcbuilder.ControllerManagedBy(mgr). For(&computev1alpha.Instance{}, mcbuilder.WithEngageWithLocalCluster(false)). - WatchesRawSource(claimSource). 
+ Watches( &quotav1alpha1.ResourceClaim{}, func(_ multicluster.ClusterName, _ cluster.Cluster) handler.TypedEventHandler[client.Object, mcreconcile.Request] { return handler.TypedEnqueueRequestsFromMapFunc( func(ctx context.Context, obj client.Object) []mcreconcile.Request { claim := obj.(*quotav1alpha1.ResourceClaim) if claim.Spec.ResourceRef.Name == "" { return nil } return []mcreconcile.Request{ { Request: reconcile.Request{ NamespacedName: types.NamespacedName{ Namespace: claim.Spec.ResourceRef.Namespace, Name: claim.Spec.ResourceRef.Name, }, }, ClusterName: r.resolveClusterNameForProject(claim.Spec.ConsumerRef.Name), }, } }, ) }, mcbuilder.WithPredicates(predicate.NewPredicateFuncs(func(obj client.Object) bool { return obj.GetLabels()[instanceQuotaClaimSourceLabel] == edgeClusterNameVal })), ). Complete(r) } diff --git a/internal/controller/instance_controller_test.go b/internal/controller/instance_controller_test.go index 1a15090b..3f833766 100644 --- a/internal/controller/instance_controller_test.go +++ b/internal/controller/instance_controller_test.go @@ -3,7 +3,6 @@ package controller import ( "context" "fmt" - "net/http" "testing" "github.com/stretchr/testify/assert" @@ -12,50 +11,39 @@ import ( apimeta "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/apimachinery/pkg/types" - "k8s.io/client-go/rest" "k8s.io/client-go/tools/record" - "sigs.k8s.io/controller-runtime/pkg/cache" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/controller-runtime/pkg/client/interceptor" "sigs.k8s.io/controller-runtime/pkg/cluster" + "sigs.k8s.io/controller-runtime/pkg/finalizer" "sigs.k8s.io/controller-runtime/pkg/reconcile" + "sigs.k8s.io/multicluster-runtime/pkg/multicluster" mcreconcile "sigs.k8s.io/multicluster-runtime/pkg/reconcile" computev1alpha 
"go.datum.net/compute/api/v1alpha" "go.datum.net/compute/internal/controller/instancecontrol" + "go.datum.net/compute/internal/quota" quotav1alpha1 "go.miloapis.com/milo/pkg/apis/quota/v1alpha1" ) -// fakeCluster implements cluster.Cluster for testing using a fake client. -type fakeCluster struct { - client client.Client - scheme *runtime.Scheme -} - -func (f *fakeCluster) GetHTTPClient() *http.Client { return nil } -func (f *fakeCluster) GetConfig() *rest.Config { return nil } -func (f *fakeCluster) GetCache() cache.Cache { return nil } -func (f *fakeCluster) GetScheme() *runtime.Scheme { return f.scheme } -func (f *fakeCluster) GetClient() client.Client { return f.client } -func (f *fakeCluster) GetFieldIndexer() client.FieldIndexer { return nil } -func (f *fakeCluster) GetEventRecorderFor(string) record.EventRecorder { return nil } -func (f *fakeCluster) GetRESTMapper() apimeta.RESTMapper { return nil } -func (f *fakeCluster) GetAPIReader() client.Reader { return f.client } -func (f *fakeCluster) Start(context.Context) error { return nil } - -// fakeMCManager is a minimal multicluster manager that returns a single cluster. -type fakeMCManager struct { - clusters map[string]cluster.Cluster -} - -func (m *fakeMCManager) GetCluster(ctx context.Context, clusterName string) (cluster.Cluster, error) { - cl, ok := m.clusters[clusterName] - if !ok { - return nil, fmt.Errorf("cluster %q not found", clusterName) - } - return cl, nil -} +// Test constants for repeated string literals across controller package tests. 
+const ( + testInstanceName = "test-instance" + testReasonString = "TestReason" + testMessageString = "Test message" + testUIDString = "test-uid" + testInstanceType = "d1-standard-2" + testDefaultPlacement = "default" + testDefaultNamespace = "default" + testEdgeClusterName = "test-edge" + testComputeAPIVersion = "compute.datumapis.com/v1alpha" + testQuotaAPIGroup = "quota.miloapis.com" + testQuotaResource = "resourceclaims" + kindWorkloadDeploymentTest = "WorkloadDeployment" // mirrors kindWorkloadDeployment +) // newTestScheme builds a runtime.Scheme with the types needed for instance reconcile tests. func newTestScheme(t *testing.T) *runtime.Scheme { @@ -79,8 +67,8 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { name: "instance without ready condition should create default", instance: &computev1alpha.Instance{ ObjectMeta: metav1.ObjectMeta{ - Name: "test-instance", - Namespace: "default", + Name: testInstanceName, + Namespace: testDefaultNamespace, Generation: 1, }, }, @@ -89,7 +77,7 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { Type: computev1alpha.InstanceReady, Status: metav1.ConditionFalse, Reason: computev1alpha.InstanceProgrammedReasonPendingProgramming, - Message: "Instance has not been programmed", + Message: msgNotProgrammed, ObservedGeneration: 1, }, }, @@ -97,8 +85,8 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { name: "instance with scheduling gates should set scheduling gates present", instance: &computev1alpha.Instance{ ObjectMeta: metav1.ObjectMeta{ - Name: "test-instance", - Namespace: "default", + Name: testInstanceName, + Namespace: testDefaultNamespace, Generation: 1, }, Spec: computev1alpha.InstanceSpec{ @@ -114,7 +102,7 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { Type: computev1alpha.InstanceReady, Status: metav1.ConditionFalse, Reason: computev1alpha.InstanceProgrammedReasonPendingProgramming, - Message: "Instance has not been programmed", + Message: msgNotProgrammed, 
ObservedGeneration: 1, LastTransitionTime: metav1.Now(), }, @@ -134,8 +122,8 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { name: "instance with scheduling gates and network failure should set network failed", instance: &computev1alpha.Instance{ ObjectMeta: metav1.ObjectMeta{ - Name: "test-instance", - Namespace: "default", + Name: testInstanceName, + Namespace: testDefaultNamespace, Generation: 1, }, Spec: computev1alpha.InstanceSpec{ @@ -153,7 +141,7 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { expectedCondition: &metav1.Condition{ Type: computev1alpha.InstanceReady, Status: metav1.ConditionFalse, - Reason: "NetworkFailedToCreate", + Reason: reasonNetworkFailedToCreate, Message: "Network creation failed: timeout", ObservedGeneration: 1, }, @@ -162,8 +150,8 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { name: "instance not programmed should set pending programming", instance: &computev1alpha.Instance{ ObjectMeta: metav1.ObjectMeta{ - Name: "test-instance", - Namespace: "default", + Name: testInstanceName, + Namespace: testDefaultNamespace, Generation: 1, }, Status: computev1alpha.InstanceStatus{ @@ -171,8 +159,8 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { { Type: computev1alpha.InstanceProgrammed, Status: metav1.ConditionFalse, - Reason: "TestReason", - Message: "Test message", + Reason: testReasonString, + Message: testMessageString, }, }, }, @@ -181,8 +169,8 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { expectedCondition: &metav1.Condition{ Type: computev1alpha.InstanceReady, Status: metav1.ConditionFalse, - Reason: "TestReason", - Message: "Test message", + Reason: testReasonString, + Message: testMessageString, ObservedGeneration: 1, }, }, @@ -190,8 +178,8 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { name: "instance programmed but not running should wait for running", instance: &computev1alpha.Instance{ ObjectMeta: metav1.ObjectMeta{ - Name: "test-instance", - Namespace: 
"default", + Name: testInstanceName, + Namespace: testDefaultNamespace, Generation: 1, }, Status: computev1alpha.InstanceStatus{ @@ -200,13 +188,13 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { Type: computev1alpha.InstanceProgrammed, Status: metav1.ConditionTrue, Reason: computev1alpha.InstanceProgrammedReasonProgrammed, - Message: "Instance has been programmed", + Message: msgInstanceProgrammed, }, { - Type: computev1alpha.InstanceRunning, + Type: computev1alpha.InstanceAvailable, Status: metav1.ConditionFalse, - Reason: "TestReason", - Message: "Test message", + Reason: testReasonString, + Message: testMessageString, }, }, }, @@ -215,8 +203,8 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { expectedCondition: &metav1.Condition{ Type: computev1alpha.InstanceReady, Status: metav1.ConditionFalse, - Reason: "TestReason", - Message: "Test message", + Reason: testReasonString, + Message: testMessageString, ObservedGeneration: 1, }, }, @@ -224,8 +212,8 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { name: "instance fully ready should set ready condition", instance: &computev1alpha.Instance{ ObjectMeta: metav1.ObjectMeta{ - Name: "test-instance", - Namespace: "default", + Name: testInstanceName, + Namespace: testDefaultNamespace, Generation: 1, }, Status: computev1alpha.InstanceStatus{ @@ -234,13 +222,13 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { Type: computev1alpha.InstanceProgrammed, Status: metav1.ConditionTrue, Reason: computev1alpha.InstanceProgrammedReasonProgrammed, - Message: "Instance has been programmed", + Message: msgInstanceProgrammed, }, { - Type: computev1alpha.InstanceRunning, + Type: computev1alpha.InstanceAvailable, Status: metav1.ConditionTrue, - Reason: computev1alpha.InstanceRunningReasonRunning, - Message: "Instance is running", + Reason: computev1alpha.InstanceAvailableReasonAvailable, + Message: msgInstanceAvailable, }, }, }, @@ -249,8 +237,8 @@ func TestReconcileInstanceReadyCondition(t 
*testing.T) { expectedCondition: &metav1.Condition{ Type: computev1alpha.InstanceReady, Status: metav1.ConditionTrue, - Reason: computev1alpha.InstanceReadyReasonRunning, - Message: "Instance is ready", + Reason: computev1alpha.InstanceReadyReasonAvailable, + Message: msgInstanceReady, ObservedGeneration: 1, }, }, @@ -258,8 +246,8 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { name: "no change when condition already matches", instance: &computev1alpha.Instance{ ObjectMeta: metav1.ObjectMeta{ - Name: "test-instance", - Namespace: "default", + Name: testInstanceName, + Namespace: testDefaultNamespace, Generation: 1, }, Status: computev1alpha.InstanceStatus{ @@ -267,8 +255,8 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { { Type: computev1alpha.InstanceReady, Status: metav1.ConditionTrue, - Reason: computev1alpha.InstanceReadyReasonRunning, - Message: "Instance is ready", + Reason: computev1alpha.InstanceReadyReasonAvailable, + Message: msgInstanceReady, ObservedGeneration: 1, LastTransitionTime: metav1.Now(), }, @@ -276,13 +264,13 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { Type: computev1alpha.InstanceProgrammed, Status: metav1.ConditionTrue, Reason: computev1alpha.InstanceProgrammedReasonProgrammed, - Message: "Instance has been programmed", + Message: msgInstanceProgrammed, }, { - Type: computev1alpha.InstanceRunning, + Type: computev1alpha.InstanceAvailable, Status: metav1.ConditionTrue, - Reason: computev1alpha.InstanceRunningReasonRunning, - Message: "Instance is running", + Reason: computev1alpha.InstanceAvailableReasonAvailable, + Message: msgInstanceAvailable, }, }, }, @@ -291,8 +279,8 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { expectedCondition: &metav1.Condition{ Type: computev1alpha.InstanceReady, Status: metav1.ConditionTrue, - Reason: computev1alpha.InstanceReadyReasonRunning, - Message: "Instance is ready", + Reason: computev1alpha.InstanceReadyReasonAvailable, + Message: msgInstanceReady, 
ObservedGeneration: 1, }, }, @@ -343,8 +331,8 @@ func TestReconcileInstanceReadyConditionWithQuota(t *testing.T) { name: "quota denied blocks ready condition", instance: &computev1alpha.Instance{ ObjectMeta: metav1.ObjectMeta{ - Name: "test-instance", - Namespace: "default", + Name: testInstanceName, + Namespace: testDefaultNamespace, Generation: 1, }, Status: computev1alpha.InstanceStatus{ @@ -360,14 +348,14 @@ func TestReconcileInstanceReadyConditionWithQuota(t *testing.T) { Type: computev1alpha.InstanceProgrammed, Status: metav1.ConditionTrue, Reason: computev1alpha.InstanceProgrammedReasonProgrammed, - Message: "Instance has been programmed", + Message: msgInstanceProgrammed, LastTransitionTime: metav1.Now(), }, { - Type: computev1alpha.InstanceRunning, + Type: computev1alpha.InstanceAvailable, Status: metav1.ConditionTrue, - Reason: computev1alpha.InstanceRunningReasonRunning, - Message: "Instance is running", + Reason: computev1alpha.InstanceAvailableReasonAvailable, + Message: msgInstanceAvailable, LastTransitionTime: metav1.Now(), }, }, @@ -385,8 +373,8 @@ func TestReconcileInstanceReadyConditionWithQuota(t *testing.T) { name: "quota available does not block ready condition", instance: &computev1alpha.Instance{ ObjectMeta: metav1.ObjectMeta{ - Name: "test-instance", - Namespace: "default", + Name: testInstanceName, + Namespace: testDefaultNamespace, Generation: 1, }, Status: computev1alpha.InstanceStatus{ @@ -402,14 +390,14 @@ func TestReconcileInstanceReadyConditionWithQuota(t *testing.T) { Type: computev1alpha.InstanceProgrammed, Status: metav1.ConditionTrue, Reason: computev1alpha.InstanceProgrammedReasonProgrammed, - Message: "Instance has been programmed", + Message: msgInstanceProgrammed, LastTransitionTime: metav1.Now(), }, { - Type: computev1alpha.InstanceRunning, + Type: computev1alpha.InstanceAvailable, Status: metav1.ConditionTrue, - Reason: computev1alpha.InstanceRunningReasonRunning, - Message: "Instance is running", + Reason: 
computev1alpha.InstanceAvailableReasonAvailable, + Message: msgInstanceAvailable, LastTransitionTime: metav1.Now(), }, }, @@ -419,16 +407,16 @@ func TestReconcileInstanceReadyConditionWithQuota(t *testing.T) { expectedCondition: &metav1.Condition{ Type: computev1alpha.InstanceReady, Status: metav1.ConditionTrue, - Reason: computev1alpha.InstanceReadyReasonRunning, - Message: "Instance is ready", + Reason: computev1alpha.InstanceReadyReasonAvailable, + Message: msgInstanceReady, }, }, { name: "quota pending unknown does not block ready condition", instance: &computev1alpha.Instance{ ObjectMeta: metav1.ObjectMeta{ - Name: "test-instance", - Namespace: "default", + Name: testInstanceName, + Namespace: testDefaultNamespace, Generation: 1, }, Status: computev1alpha.InstanceStatus{ @@ -448,7 +436,7 @@ func TestReconcileInstanceReadyConditionWithQuota(t *testing.T) { Type: computev1alpha.InstanceReady, Status: metav1.ConditionFalse, Reason: computev1alpha.InstanceProgrammedReasonPendingProgramming, - Message: "Instance has not been programmed", + Message: msgNotProgrammed, }, }, } @@ -501,25 +489,28 @@ func TestReconcileQuota(t *testing.T) { ObjectMeta: metav1.ObjectMeta{ Name: deploymentName, Namespace: namespace, - UID: "test-uid", + UID: testUIDString, }, } } // makeInstance creates a test Instance with an owner reference to the // deployment so that checkForNetworkCreationFailure can look it up. + // Both finalizers are pre-populated so that the finalizer framework does + // not need to add instanceControllerFinalizer on the first reconcile, + // which would cause an early return before quota logic runs. 
makeInstance := func(_ *runtime.Scheme, gates ...computev1alpha.SchedulingGate) *computev1alpha.Instance { return &computev1alpha.Instance{ ObjectMeta: metav1.ObjectMeta{ Name: instanceName, - Namespace: namespace, - Finalizers: []string{instanceQuotaFinalizer}, + Namespace: testDefaultNamespace, + Finalizers: []string{instanceQuotaFinalizer, instanceControllerFinalizer}, OwnerReferences: []metav1.OwnerReference{ { - APIVersion: "compute.datumapis.com/v1alpha", - Kind: "WorkloadDeployment", + APIVersion: testComputeAPIVersion, + Kind: kindWorkloadDeploymentTest, Name: deploymentName, - UID: "test-uid", + UID: testUIDString, Controller: func() *bool { b := true; return &b }(), }, }, @@ -529,7 +520,7 @@ func TestReconcileQuota(t *testing.T) { SchedulingGates: gates, }, Runtime: computev1alpha.InstanceRuntimeSpec{ - Resources: computev1alpha.InstanceRuntimeResources{InstanceType: "d1-standard-2"}, + Resources: computev1alpha.InstanceRuntimeResources{InstanceType: testInstanceType}, }, NetworkInterfaces: []computev1alpha.InstanceNetworkInterface{}, }, @@ -544,18 +535,21 @@ func TestReconcileQuota(t *testing.T) { }, Spec: quotav1alpha1.ResourceClaimSpec{ ConsumerRef: quotav1alpha1.ConsumerRef{ - APIGroup: "resourcemanager.miloapis.com", - Kind: "Project", + APIGroup: miloProjectAPIGroup, + Kind: miloProjectKind, Name: clusterName, }, + // ResourceRef points at the Project resource (cluster-scoped), not the + // Instance. The quota admission plugin validates against the + // ResourceRegistration's claimingResources, which only allows + // resourcemanager.miloapis.com/Project. 
ResourceRef: quotav1alpha1.UnversionedObjectReference{ - APIGroup: "compute.datumapis.com", - Kind: "Instance", - Name: instanceName, - Namespace: namespace, + APIGroup: miloProjectAPIGroup, + Kind: miloProjectKind, + Name: clusterName, }, Requests: []quotav1alpha1.ResourceRequest{ - {ResourceType: "compute.datumapis.com/instances", Amount: 1}, + {ResourceType: quotaResourceTypeInstances, Amount: 1}, }, }, Status: quotav1alpha1.ResourceClaimStatus{ @@ -572,7 +566,7 @@ func TestReconcileQuota(t *testing.T) { } } - newReconciler := func(t *testing.T, projectObjs []client.Object, mgmtObjs []client.Object) (*InstanceReconciler, client.Client, client.Client) { + newReconciler := func(t *testing.T, projectObjs []client.Object, quotaObjs []client.Object) (*InstanceReconciler, client.Client, client.Client) { t.Helper() s := newTestScheme(t) @@ -582,26 +576,44 @@ func TestReconcileQuota(t *testing.T) { WithStatusSubresource(&computev1alpha.Instance{}). Build() - mgmtClient := fake.NewClientBuilder(). + quotaClient := fake.NewClientBuilder(). WithScheme(s). - WithObjects(mgmtObjs...). + WithObjects(quotaObjs...). WithStatusSubresource(&quotav1alpha1.ResourceClaim{}). Build() mgr := &fakeMCManager{ clusters: map[string]cluster.Cluster{ - clusterName: &fakeCluster{client: projectClient, scheme: s}, + clusterName: newFakeCluster(projectClient), }, } + qm := quota.New(nil) + qm.StoreClient(clusterName, quotaClient) + r := &InstanceReconciler{ - mgr: mgr, - managementCluster: &fakeCluster{client: mgmtClient, scheme: s}, + mgr: mgr, + scheme: s, + quotaClientManager: qm, + edgeClusterName: testEdgeClusterName, + // Milo mode: project ID == ClusterName; claim namespace == instance.Namespace. + projectIDForInstance: func(_ context.Context, cn multicluster.ClusterName, _ *computev1alpha.Instance) (string, error) { + return string(cn), nil + }, + // nil → falls back to instance.Namespace, which is correct for Milo mode.
+ projectNamespaceForInstance: nil, } - return r, projectClient, mgmtClient + + // Initialize the finalizer registry so that r.finalizers.Finalize is not + // a nil-pointer dereference. SetupWithManager does this in production; in + // tests we replicate the same steps manually. + r.finalizers = finalizer.NewFinalizers() + require.NoError(t, r.finalizers.Register(instanceControllerFinalizer, r)) + + return r, projectClient, quotaClient } - t.Run("quota granted flow: claim granted removes gate and sets QuotaGranted=True", func(t *testing.T) { + t.Run("quota granted flow: claim granted removes gate and sets QuotaGranted=True in single reconcile", func(t *testing.T) { s := newTestScheme(t) instance := makeInstance(s, computev1alpha.SchedulingGate{Name: instancecontrol.NetworkSchedulingGate.String()}, @@ -611,7 +623,10 @@ func TestReconcileQuota(t *testing.T) { r, projectClient, _ := newReconciler(t, []client.Object{instance, makeDeployment()}, []client.Object{claim}) - // First reconcile: sets QuotaGranted=True in status, returns early. + // Single reconcile: sets QuotaGranted=True in status AND removes the + // Quota scheduling gate in the same pass. The early-return-before-gate- + // removal bug required a second reconcile that never arrived because + // ResourceClaims are immutable and local Instances are not watched. _, err := r.Reconcile(context.Background(), mcreconcile.Request{Request: reconcile.Request{NamespacedName: types.NamespacedName{Namespace: namespace, Name: instanceName}}, ClusterName: clusterName}) require.NoError(t, err) @@ -623,19 +638,13 @@ func TestReconcileQuota(t *testing.T) { assert.Equal(t, metav1.ConditionTrue, quotaCond.Status) assert.Equal(t, computev1alpha.InstanceQuotaGrantedReasonQuotaAvailable, quotaCond.Reason) - // Second reconcile: status is already set, so removes the scheduling gate. 
- _, err = r.Reconcile(context.Background(), mcreconcile.Request{Request: reconcile.Request{NamespacedName: types.NamespacedName{Namespace: namespace, Name: instanceName}}, ClusterName: clusterName}) - require.NoError(t, err) - - require.NoError(t, projectClient.Get(context.Background(), types.NamespacedName{Namespace: namespace, Name: instanceName}, &updated)) - hasQuotaGate := false for _, g := range updated.Spec.Controller.SchedulingGates { if g.Name == instancecontrol.QuotaSchedulingGate.String() { hasQuotaGate = true } } - assert.False(t, hasQuotaGate, "QuotaSchedulingGate should have been removed") + assert.False(t, hasQuotaGate, "QuotaSchedulingGate must be removed in the same reconcile pass as the status update") }) t.Run("quota exceeded flow: conditions cascade to block Programmed/Running/Ready", func(t *testing.T) { @@ -664,10 +673,10 @@ func TestReconcileQuota(t *testing.T) { assert.Equal(t, metav1.ConditionFalse, programmedCond.Status) assert.Equal(t, computev1alpha.InstanceProgrammedReasonPendingQuota, programmedCond.Reason) - runningCond := apimeta.FindStatusCondition(updated.Status.Conditions, computev1alpha.InstanceRunning) - require.NotNil(t, runningCond) - assert.Equal(t, metav1.ConditionFalse, runningCond.Status) - assert.Equal(t, computev1alpha.InstanceProgrammedReasonPendingQuota, runningCond.Reason) + availableCond := apimeta.FindStatusCondition(updated.Status.Conditions, computev1alpha.InstanceAvailable) + require.NotNil(t, availableCond) + assert.Equal(t, metav1.ConditionFalse, availableCond.Status) + assert.Equal(t, computev1alpha.InstanceProgrammedReasonPendingQuota, availableCond.Reason) readyCond := apimeta.FindStatusCondition(updated.Status.Conditions, computev1alpha.InstanceReady) require.NotNil(t, readyCond) @@ -709,7 +718,9 @@ func TestReconcileQuota(t *testing.T) { } require.NoError(t, mgmtClient.Status().Update(context.Background(), &existingClaim)) - // Second reconcile should see granted claim and update status. 
+ // Second reconcile should see the granted claim, update status to + // QuotaGranted=True, AND remove the gate in the same pass (no third + // reconcile required). _, err = r.Reconcile(context.Background(), mcreconcile.Request{Request: reconcile.Request{NamespacedName: types.NamespacedName{Namespace: namespace, Name: instanceName}}, ClusterName: clusterName}) require.NoError(t, err) @@ -719,28 +730,41 @@ func TestReconcileQuota(t *testing.T) { require.NotNil(t, quotaCond) assert.Equal(t, metav1.ConditionTrue, quotaCond.Status) - // Third reconcile removes the gate (status is already true, no more status write needed). - _, err = r.Reconcile(context.Background(), mcreconcile.Request{Request: reconcile.Request{NamespacedName: types.NamespacedName{Namespace: namespace, Name: instanceName}}, ClusterName: clusterName}) - require.NoError(t, err) - - require.NoError(t, projectClient.Get(context.Background(), types.NamespacedName{Namespace: namespace, Name: instanceName}, &recovered)) hasQuotaGate := false for _, g := range recovered.Spec.Controller.SchedulingGates { if g.Name == instancecontrol.QuotaSchedulingGate.String() { hasQuotaGate = true } } - assert.False(t, hasQuotaGate, "QuotaSchedulingGate should have been removed after quota granted") + assert.False(t, hasQuotaGate, "QuotaSchedulingGate should be removed in the same reconcile pass that sets QuotaGranted=True") }) t.Run("deleted before grant: finalizer deletes claim and is removed", func(t *testing.T) { s := newTestScheme(t) now := metav1.Now() - instance := makeInstance(s, - computev1alpha.SchedulingGate{Name: instancecontrol.QuotaSchedulingGate.String()}, - ) - instance.DeletionTimestamp = &now + // Build the instance directly without instanceControllerFinalizer to + // represent the state after the Karmada finalizer has already been + // cleaned up; only the quota finalizer remains to be processed. 
+ instance := &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: instanceName, + Namespace: namespace, + DeletionTimestamp: &now, + Finalizers: []string{instanceQuotaFinalizer}, + }, + Spec: computev1alpha.InstanceSpec{ + Controller: &computev1alpha.InstanceController{ + SchedulingGates: []computev1alpha.SchedulingGate{ + {Name: instancecontrol.QuotaSchedulingGate.String()}, + }, + }, + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{InstanceType: testInstanceType}, + }, + NetworkInterfaces: []computev1alpha.InstanceNetworkInterface{}, + }, + } claim := makeClaim(s, metav1.ConditionFalse, quotav1alpha1.ResourceClaimPendingReason) @@ -766,3 +790,783 @@ func TestReconcileQuota(t *testing.T) { } }) } + +// TestQuotaGateRemovedInSingleReconcile is a regression test for the bug where +// the Quota scheduling gate was never removed from an Instance after quota was +// granted. The root cause was an early return in the Reconcile function: when +// reconcileQuotaCondition set QuotaGranted=True (statusChanged=true), the code +// wrote the status update and returned before reaching removeQuotaSchedulingGate. +// Because ResourceClaims are immutable (no further transitions) and local +// Instances are not watched (WithEngageWithLocalCluster(false)), no requeue ever +// arrived — leaving the Quota gate stranded in spec.controller.schedulingGates +// and the projected Instance stuck "Pending (SchedulingGatesPresent)". +// +// The fix: on the success path (quotaErr==nil), fall through to +// removeQuotaSchedulingGate after persisting the status update, so gate removal +// happens in the same reconcile pass as the QuotaGranted=True status write. 
+func TestQuotaGateRemovedInSingleReconcile(t *testing.T) { + const ( + clusterName = "test-project" + namespace = "default" + instanceName = "my-instance" + deploymentName = "my-deployment" + ) + + claimName := namespace + "--" + instanceName + + tests := []struct { + name string + initialGates []computev1alpha.SchedulingGate + expectGateGone bool + }{ + { + name: "Quota gate only: removed in single reconcile when claim is granted", + initialGates: []computev1alpha.SchedulingGate{ + {Name: instancecontrol.QuotaSchedulingGate.String()}, + }, + expectGateGone: true, + }, + { + name: "Quota gate plus Network gate: Quota removed, Network preserved", + initialGates: []computev1alpha.SchedulingGate{ + {Name: instancecontrol.NetworkSchedulingGate.String()}, + {Name: instancecontrol.QuotaSchedulingGate.String()}, + }, + expectGateGone: true, + }, + { + name: "No gates: no-op, reconcile completes cleanly", + initialGates: []computev1alpha.SchedulingGate{}, + expectGateGone: false, // no gate to begin with + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + s := newTestScheme(t) + + instance := &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: instanceName, + Namespace: namespace, + Generation: 1, + Finalizers: []string{instanceQuotaFinalizer, instanceControllerFinalizer}, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: testComputeAPIVersion, + Kind: kindWorkloadDeploymentTest, + Name: deploymentName, + UID: testUIDString, + Controller: func() *bool { b := true; return &b }(), + }, + }, + }, + Spec: computev1alpha.InstanceSpec{ + Controller: &computev1alpha.InstanceController{ + SchedulingGates: tt.initialGates, + }, + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{InstanceType: testInstanceType}, + }, + NetworkInterfaces: []computev1alpha.InstanceNetworkInterface{}, + }, + } + + deployment := &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{Name: 
deploymentName, Namespace: namespace, UID: testUIDString}, + } + + // ResourceClaim already in QuotaAvailable state — simulates the state + // that triggered the bug: claim already granted but gate still present. + claim := &quotav1alpha1.ResourceClaim{ + ObjectMeta: metav1.ObjectMeta{Name: claimName, Namespace: namespace}, + Spec: quotav1alpha1.ResourceClaimSpec{ + ConsumerRef: quotav1alpha1.ConsumerRef{ + APIGroup: miloProjectAPIGroup, Kind: miloProjectKind, Name: clusterName, + }, + ResourceRef: quotav1alpha1.UnversionedObjectReference{ + APIGroup: miloProjectAPIGroup, Kind: miloProjectKind, Name: clusterName, + }, + Requests: []quotav1alpha1.ResourceRequest{ + {ResourceType: quotaResourceTypeInstances, Amount: 1}, + }, + }, + Status: quotav1alpha1.ResourceClaimStatus{ + Conditions: []metav1.Condition{ + { + Type: quotav1alpha1.ResourceClaimGranted, + Status: metav1.ConditionTrue, + Reason: quotav1alpha1.ResourceClaimGrantedReason, + Message: "quota available", + LastTransitionTime: metav1.Now(), + }, + }, + }, + } + + projectClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(instance, deployment). + WithStatusSubresource(&computev1alpha.Instance{}). + Build() + + quotaClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(claim). + WithStatusSubresource(&quotav1alpha1.ResourceClaim{}).
+ Build() + + mgr := &fakeMCManager{ + clusters: map[string]cluster.Cluster{ + clusterName: newFakeCluster(projectClient), + }, + } + + qm := quota.New(nil) + qm.StoreClient(clusterName, quotaClient) + + r := &InstanceReconciler{ + mgr: mgr, + scheme: s, + quotaClientManager: qm, + edgeClusterName: testEdgeClusterName, + projectIDForInstance: func(_ context.Context, cn multicluster.ClusterName, _ *computev1alpha.Instance) (string, error) { + return string(cn), nil + }, + } + r.finalizers = finalizer.NewFinalizers() + require.NoError(t, r.finalizers.Register(instanceControllerFinalizer, r)) + + // Exactly one reconcile — must be sufficient to both set QuotaGranted=True + // and remove the Quota gate. No second reconcile should be required. + _, err := r.Reconcile(context.Background(), mcreconcile.Request{ + Request: reconcile.Request{NamespacedName: types.NamespacedName{Namespace: namespace, Name: instanceName}}, + ClusterName: clusterName, + }) + require.NoError(t, err) + + var updated computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Namespace: namespace, Name: instanceName}, &updated)) + + // QuotaGranted condition must be set to True. + quotaCond := apimeta.FindStatusCondition(updated.Status.Conditions, computev1alpha.InstanceQuotaGranted) + require.NotNil(t, quotaCond, "QuotaGranted condition must be present") + assert.Equal(t, metav1.ConditionTrue, quotaCond.Status) + assert.Equal(t, computev1alpha.InstanceQuotaGrantedReasonQuotaAvailable, quotaCond.Reason) + + // Quota gate must be gone after the single reconcile. 
+ hasQuotaGate := false + for _, g := range updated.Spec.Controller.SchedulingGates { + if g.Name == instancecontrol.QuotaSchedulingGate.String() { + hasQuotaGate = true + } + } + if tt.expectGateGone { + assert.False(t, hasQuotaGate, + "Quota gate must be removed in the same reconcile pass as the QuotaGranted=True status write; "+ + "a stranded gate leaves the projected Instance stuck Pending (SchedulingGatesPresent)") + } + + // Network gate (if present) must be preserved — only the Quota gate is + // cleared by InstanceReconciler; NetworkSchedulingGate is owned by + // WorkloadDeploymentReconciler. + for _, g := range updated.Spec.Controller.SchedulingGates { + assert.NotEqual(t, instancecontrol.QuotaSchedulingGate.String(), g.Name, + "Quota gate must not remain after granted claim") + } + }) + } +} + +// TestReconcileQuotaSingleMode verifies that in single-cell mode: +// - the project ID is decoded from the upstream-cluster-name label on the edge +// namespace (not taken from the always-"single" ClusterName) +// - the ResourceClaim is created in the in-project namespace (upstream-namespace +// label, e.g. "default"), not in the edge namespace (ns-abc123) +// - the ResourceRef points at resourcemanager.miloapis.com/Project, not Instance +func TestReconcileQuotaSingleMode(t *testing.T) { + const ( + instanceName = "my-instance" + edgeNS = "ns-abc123" // edge namespace (ns-{uid}) — does NOT exist in project CP + projectID = "datum-cloud" // decoded from "cluster-datum-cloud" + projectNS = "default" // upstream-namespace label value — where claims live + deploymentName = "my-deployment" + ) + + // Claim name uses the edge namespace prefix (stable identifier for the claim) + // but the claim object itself lives in projectNS. 
+ claimName := edgeNS + "--" + instanceName + + s := newTestScheme(t) + + instance := &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: instanceName, + Namespace: edgeNS, + Finalizers: []string{instanceQuotaFinalizer, instanceControllerFinalizer}, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: testComputeAPIVersion, + Kind: kindWorkloadDeploymentTest, + Name: deploymentName, + UID: testUIDString, + Controller: func() *bool { b := true; return &b }(), + }, + }, + }, + Spec: computev1alpha.InstanceSpec{ + Controller: &computev1alpha.InstanceController{ + SchedulingGates: []computev1alpha.SchedulingGate{ + {Name: instancecontrol.QuotaSchedulingGate.String()}, + }, + }, + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{InstanceType: testInstanceType}, + }, + NetworkInterfaces: []computev1alpha.InstanceNetworkInterface{}, + }, + } + + deployment := &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{Name: deploymentName, Namespace: edgeNS, UID: "test-uid"}, + } + + // ResourceClaim lives in projectNS ("default"), not edgeNS ("ns-abc123"). + // ResourceRef points at the Project resource, matching the ResourceRegistration's + // claimingResources (resourcemanager.miloapis.com/Project only). 
+ claim := &quotav1alpha1.ResourceClaim{ + ObjectMeta: metav1.ObjectMeta{Name: claimName, Namespace: projectNS}, + Spec: quotav1alpha1.ResourceClaimSpec{ + ConsumerRef: quotav1alpha1.ConsumerRef{ + APIGroup: miloProjectAPIGroup, + Kind: miloProjectKind, + Name: projectID, + }, + ResourceRef: quotav1alpha1.UnversionedObjectReference{ + APIGroup: miloProjectAPIGroup, + Kind: miloProjectKind, + Name: projectID, + }, + Requests: []quotav1alpha1.ResourceRequest{ + {ResourceType: quotaResourceTypeInstances, Amount: 1}, + }, + }, + Status: quotav1alpha1.ResourceClaimStatus{ + Conditions: []metav1.Condition{ + { + Type: quotav1alpha1.ResourceClaimGranted, + Status: metav1.ConditionTrue, + Reason: quotav1alpha1.ResourceClaimGrantedReason, + Message: "quota granted", + LastTransitionTime: metav1.Now(), + }, + }, + }, + } + + projectClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(instance, deployment). + WithStatusSubresource(&computev1alpha.Instance{}). + Build() + + // The quota client is keyed by projectID ("datum-cloud"), matching what + // projectIDForInstance returns after decoding "cluster-datum-cloud". + quotaClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(claim). + WithStatusSubresource(&quotav1alpha1.ResourceClaim{}). + Build() + + qm := quota.New(nil) + qm.StoreClient(projectID, quotaClient) + + const singleCluster = "single" + + mgr := &fakeMCManager{ + clusters: map[string]cluster.Cluster{ + singleCluster: newFakeCluster(projectClient), + }, + } + + r := &InstanceReconciler{ + mgr: mgr, + scheme: s, + quotaClientManager: qm, + edgeClusterName: singleCluster, + // Single-cell mode: project ID decoded from upstream-cluster-name label. + // Simulates what cmd/main.go does for "cluster-datum-cloud" → "datum-cloud". + projectIDForInstance: func(_ context.Context, _ multicluster.ClusterName, _ *computev1alpha.Instance) (string, error) { + return projectID, nil + }, + // Single-cell mode: claim namespace comes from upstream-namespace label.
+ // Simulates what cmd/main.go does by reading the edge namespace labels. + projectNamespaceForInstance: func(_ context.Context, _ multicluster.ClusterName, _ *computev1alpha.Instance) (string, error) { + return projectNS, nil + }, + // Single-cell mode: watch map func must always return "single". + clusterNameForProject: func(_ string) multicluster.ClusterName { + return singleCluster + }, + } + + r.finalizers = finalizer.NewFinalizers() + require.NoError(t, r.finalizers.Register(instanceControllerFinalizer, r)) + + req := mcreconcile.Request{ + Request: reconcile.Request{NamespacedName: types.NamespacedName{Namespace: edgeNS, Name: instanceName}}, + ClusterName: singleCluster, + } + + _, err := r.Reconcile(context.Background(), req) + require.NoError(t, err) + + var updated computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), types.NamespacedName{Namespace: edgeNS, Name: instanceName}, &updated)) + + quotaCond := apimeta.FindStatusCondition(updated.Status.Conditions, computev1alpha.InstanceQuotaGranted) + require.NotNil(t, quotaCond, "QuotaGranted condition must be set") + assert.Equal(t, metav1.ConditionTrue, quotaCond.Status, "quota should be granted in single mode") + assert.Equal(t, computev1alpha.InstanceQuotaGrantedReasonQuotaAvailable, quotaCond.Reason) + + // Verify clusterNameForProject always returns "single" so the watch map func + // never enqueues an unknown cluster name. + assert.Equal(t, multicluster.ClusterName(singleCluster), r.resolveClusterNameForProject(projectID)) + assert.Equal(t, multicluster.ClusterName(singleCluster), r.resolveClusterNameForProject("any-other-project")) + + // Verify resolveProjectNamespace returns the in-project namespace, not the edge namespace. 
+ resolvedNS, resolveErr := r.resolveProjectNamespace(context.Background(), singleCluster, instance) + require.NoError(t, resolveErr) + assert.Equal(t, projectNS, resolvedNS, "claim namespace must be the in-project namespace, not the edge namespace") +} + +// TestReconcileQuotaFailureModes verifies that infrastructure failures in the +// quota path set specific QuotaGranted=False conditions (fail-closed) rather +// than silently allowing workloads to schedule. +func TestReconcileQuotaFailureModes(t *testing.T) { + const ( + testProject = "test-project" + testNS = "default" + testInstance = "my-instance" + testDeployment = "my-deployment" + ) + + makeInstance := func() *computev1alpha.Instance { + return &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: testInstance, + Namespace: testNS, + Finalizers: []string{instanceQuotaFinalizer, instanceControllerFinalizer}, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: testComputeAPIVersion, + Kind: kindWorkloadDeploymentTest, + Name: testDeployment, + UID: testUIDString, + Controller: func() *bool { b := true; return &b }(), + }, + }, + }, + Spec: computev1alpha.InstanceSpec{ + Controller: &computev1alpha.InstanceController{ + SchedulingGates: []computev1alpha.SchedulingGate{ + {Name: instancecontrol.QuotaSchedulingGate.String()}, + }, + }, + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{InstanceType: testInstanceType}, + }, + NetworkInterfaces: []computev1alpha.InstanceNetworkInterface{}, + }, + } + } + + makeDeployment := func() *computev1alpha.WorkloadDeployment { + return &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{Name: testDeployment, Namespace: testNS, UID: testUIDString}, + } + } + + newReconcilerWithInterceptor := func( + t *testing.T, + funcs interceptor.Funcs, + fakeRecorder *record.FakeRecorder, + ) (*InstanceReconciler, client.Client) { + t.Helper() + s := newTestScheme(t) + + projectClient := 
fake.NewClientBuilder(). + WithScheme(s). + WithObjects(makeInstance(), makeDeployment()). + WithStatusSubresource(&computev1alpha.Instance{}). + Build() + + quotaClient := fake.NewClientBuilder(). + WithScheme(s). + WithInterceptorFuncs(funcs). + Build() + + mgr := &fakeMCManager{ + clusters: map[string]cluster.Cluster{ + testProject: newFakeCluster(projectClient), + }, + } + + qm := quota.New(nil) + qm.StoreClient(testProject, quotaClient) + + r := &InstanceReconciler{ + mgr: mgr, + scheme: s, + quotaClientManager: qm, + edgeClusterName: testEdgeClusterName, + recorder: fakeRecorder, + projectIDForInstance: func(_ context.Context, cn multicluster.ClusterName, _ *computev1alpha.Instance) (string, error) { + return string(cn), nil + }, + } + r.finalizers = finalizer.NewFinalizers() + require.NoError(t, r.finalizers.Register(instanceControllerFinalizer, r)) + return r, projectClient + } + + reconcileReq := func() mcreconcile.Request { + return mcreconcile.Request{ + Request: reconcile.Request{NamespacedName: types.NamespacedName{Namespace: testNS, Name: testInstance}}, + ClusterName: testProject, + } + } + + t.Run("FM-2: backend unreachable sets QuotaBackendUnavailable", func(t *testing.T) { + fakeRecorder := record.NewFakeRecorder(10) + r, projectClient := newReconcilerWithInterceptor(t, interceptor.Funcs{ + Get: func(_ context.Context, _ client.WithWatch, _ client.ObjectKey, _ client.Object, _ ...client.GetOption) error { + return fmt.Errorf("connection refused") + }, + }, fakeRecorder) + + _, err := r.Reconcile(context.Background(), reconcileReq()) + // Reconcile returns error for transient failures. 
+ require.Error(t, err) + + var updated computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Namespace: testNS, Name: testInstance}, &updated)) + + cond := apimeta.FindStatusCondition(updated.Status.Conditions, computev1alpha.InstanceQuotaGranted) + require.NotNil(t, cond) + assert.Equal(t, metav1.ConditionFalse, cond.Status) + assert.Equal(t, computev1alpha.InstanceQuotaGrantedReasonBackendUnavailable, cond.Reason) + + // Event should have been emitted. + select { + case event := <-fakeRecorder.Events: + assert.Contains(t, event, computev1alpha.InstanceQuotaGrantedReasonBackendUnavailable) + default: + t.Error("expected a Warning event for backend unavailable, got none") + } + }) + + // FM-4/FM-5: 404 on Create maps to NamespaceNotFound when the claim namespace + // is known (the more common case for project-exists-but-namespace-absent), and + // to ProjectNotFound when the namespace itself is empty (project CP path missing). + t.Run("FM-5: 404 on Create with known namespace sets QuotaNamespaceNotFound", func(t *testing.T) { + fakeRecorder := record.NewFakeRecorder(10) + notFoundErr := apierrors.NewNotFound( + schema.GroupResource{Group: testQuotaAPIGroup, Resource: testQuotaResource}, "claim") + r, projectClient := newReconcilerWithInterceptor(t, interceptor.Funcs{ + Get: func(_ context.Context, _ client.WithWatch, _ client.ObjectKey, _ client.Object, _ ...client.GetOption) error { + return notFoundErr + }, + Create: func(_ context.Context, _ client.WithWatch, _ client.Object, _ ...client.CreateOption) error { + return notFoundErr + }, + }, fakeRecorder) + + _, err := r.Reconcile(context.Background(), reconcileReq()) + require.Error(t, err) + + var updated computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Namespace: testNS, Name: testInstance}, &updated)) + + cond := apimeta.FindStatusCondition(updated.Status.Conditions, 
computev1alpha.InstanceQuotaGranted) + require.NotNil(t, cond) + assert.Equal(t, metav1.ConditionFalse, cond.Status) + // claimNamespace == testNS (non-empty) → NamespaceNotFound, not ProjectNotFound. + assert.Equal(t, computev1alpha.InstanceQuotaGrantedReasonNamespaceNotFound, cond.Reason, + "404 on Create with known namespace should map to NamespaceNotFound") + + select { + case event := <-fakeRecorder.Events: + assert.Contains(t, event, computev1alpha.InstanceQuotaGrantedReasonNamespaceNotFound) + default: + t.Error("expected a Warning event for namespace not found, got none") + } + }) + + t.Run("FM-6: 403 on Create sets QuotaMisconfigured", func(t *testing.T) { + fakeRecorder := record.NewFakeRecorder(10) + forbiddenErr := apierrors.NewForbidden( + schema.GroupResource{Group: testQuotaAPIGroup, Resource: testQuotaResource}, "claim", + fmt.Errorf("ResourceRegistration not found")) + r, projectClient := newReconcilerWithInterceptor(t, interceptor.Funcs{ + Get: func(_ context.Context, _ client.WithWatch, _ client.ObjectKey, _ client.Object, _ ...client.GetOption) error { + return apierrors.NewNotFound( + schema.GroupResource{Group: testQuotaAPIGroup, Resource: testQuotaResource}, "claim") + }, + Create: func(_ context.Context, _ client.WithWatch, _ client.Object, _ ...client.CreateOption) error { + return forbiddenErr + }, + }, fakeRecorder) + + _, err := r.Reconcile(context.Background(), reconcileReq()) + require.Error(t, err) + + var updated computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Namespace: testNS, Name: testInstance}, &updated)) + + cond := apimeta.FindStatusCondition(updated.Status.Conditions, computev1alpha.InstanceQuotaGranted) + require.NotNil(t, cond) + assert.Equal(t, metav1.ConditionFalse, cond.Status) + assert.Equal(t, computev1alpha.InstanceQuotaGrantedReasonMisconfigured, cond.Reason, + "403 on Create should map to Misconfigured") + + select { + case event := <-fakeRecorder.Events: + 
assert.Contains(t, event, computev1alpha.InstanceQuotaGrantedReasonMisconfigured) + default: + t.Error("expected a Warning event for misconfigured quota, got none") + } + }) + + t.Run("FM-7: claim pending with no budget sets QuotaNoBudget", func(t *testing.T) { + s := newTestScheme(t) + fakeRecorder := record.NewFakeRecorder(10) + + claimName := testNS + "--" + testInstance + pendingClaim := "av1alpha1.ResourceClaim{ + ObjectMeta: metav1.ObjectMeta{Name: claimName, Namespace: testNS}, + Spec: quotav1alpha1.ResourceClaimSpec{ + ConsumerRef: quotav1alpha1.ConsumerRef{ + APIGroup: miloProjectAPIGroup, + Kind: miloProjectKind, + Name: testProject, + }, + ResourceRef: quotav1alpha1.UnversionedObjectReference{ + APIGroup: miloProjectAPIGroup, + Kind: miloProjectKind, + Name: testProject, + }, + Requests: []quotav1alpha1.ResourceRequest{ + {ResourceType: quotaResourceTypeInstances, Amount: 1}, + }, + }, + Status: quotav1alpha1.ResourceClaimStatus{ + Conditions: []metav1.Condition{ + { + Type: quotav1alpha1.ResourceClaimGranted, + Status: metav1.ConditionFalse, + Reason: quotav1alpha1.ResourceClaimPendingReason, + Message: "No AllowanceBucket configured", + LastTransitionTime: metav1.Now(), + }, + }, + }, + } + + projectClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(makeInstance(), makeDeployment()). + WithStatusSubresource(&computev1alpha.Instance{}). + Build() + + quotaClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(pendingClaim). + WithStatusSubresource("av1alpha1.ResourceClaim{}). 
+ Build() + + mgr := &fakeMCManager{ + clusters: map[string]cluster.Cluster{ + testProject: newFakeCluster(projectClient), + }, + } + qm := quota.New(nil) + qm.StoreClient(testProject, quotaClient) + + r := &InstanceReconciler{ + mgr: mgr, + scheme: s, + quotaClientManager: qm, + edgeClusterName: testEdgeClusterName, + recorder: fakeRecorder, + projectIDForInstance: func(_ context.Context, cn multicluster.ClusterName, _ *computev1alpha.Instance) (string, error) { + return string(cn), nil + }, + } + r.finalizers = finalizer.NewFinalizers() + require.NoError(t, r.finalizers.Register(instanceControllerFinalizer, r)) + + _, err := r.Reconcile(context.Background(), reconcileReq()) + require.NoError(t, err, "pending-no-budget is not a transient error — no requeue needed") + + var updated computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Namespace: testNS, Name: testInstance}, &updated)) + + cond := apimeta.FindStatusCondition(updated.Status.Conditions, computev1alpha.InstanceQuotaGranted) + require.NotNil(t, cond) + assert.Equal(t, metav1.ConditionUnknown, cond.Status) + assert.Equal(t, computev1alpha.InstanceQuotaGrantedReasonNoBudget, cond.Reason, + "pending claim with no budget should use NoBudget reason, not PendingEvaluation") + + select { + case event := <-fakeRecorder.Events: + assert.Contains(t, event, computev1alpha.InstanceQuotaGrantedReasonNoBudget) + default: + t.Error("expected a Warning event for no budget, got none") + } + }) + + t.Run("quota disabled: quotaClientManager nil sets QuotaDisabled (not QuotaAvailable)", func(t *testing.T) { + s := newTestScheme(t) + instance := makeInstance() + + projectClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(instance, makeDeployment()). + WithStatusSubresource(&computev1alpha.Instance{}). 
+ Build() + + mgr := &fakeMCManager{ + clusters: map[string]cluster.Cluster{ + testProject: newFakeCluster(projectClient), + }, + } + + r := &InstanceReconciler{ + mgr: mgr, + scheme: s, + quotaClientManager: nil, // explicitly disabled + edgeClusterName: testEdgeClusterName, + recorder: record.NewFakeRecorder(10), + projectIDForInstance: func(_ context.Context, cn multicluster.ClusterName, _ *computev1alpha.Instance) (string, error) { + return string(cn), nil + }, + } + r.finalizers = finalizer.NewFinalizers() + require.NoError(t, r.finalizers.Register(instanceControllerFinalizer, r)) + + _, err := r.Reconcile(context.Background(), reconcileReq()) + require.NoError(t, err) + + var updated computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Namespace: testNS, Name: testInstance}, &updated)) + + cond := apimeta.FindStatusCondition(updated.Status.Conditions, computev1alpha.InstanceQuotaGranted) + require.NotNil(t, cond) + assert.Equal(t, metav1.ConditionTrue, cond.Status) + assert.Equal(t, computev1alpha.InstanceQuotaGrantedReasonQuotaDisabled, cond.Reason, + "intentionally disabled quota should use QuotaDisabled reason") + }) + + t.Run("observedGeneration guard: stale True condition does not remove gate for new generation", func(t *testing.T) { + s := newTestScheme(t) + fakeRecorder := record.NewFakeRecorder(10) + + // Instance at generation 2 with a stale QuotaGranted=True from generation 1. + instance := makeInstance() + instance.Generation = 2 + instance.Status.Conditions = []metav1.Condition{ + { + Type: computev1alpha.InstanceQuotaGranted, + Status: metav1.ConditionTrue, + Reason: computev1alpha.InstanceQuotaGrantedReasonQuotaAvailable, + Message: "quota granted (generation 1)", + ObservedGeneration: 1, // stale — does not match instance.Generation=2 + LastTransitionTime: metav1.Now(), + }, + } + + projectClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(instance, makeDeployment()). 
+ WithStatusSubresource(&computev1alpha.Instance{}). + Build() + + claimName := testNS + "--" + testInstance + grantedClaim := "av1alpha1.ResourceClaim{ + ObjectMeta: metav1.ObjectMeta{Name: claimName, Namespace: testNS}, + Spec: quotav1alpha1.ResourceClaimSpec{ + ConsumerRef: quotav1alpha1.ConsumerRef{APIGroup: miloProjectAPIGroup, Kind: miloProjectKind, Name: testProject}, + ResourceRef: quotav1alpha1.UnversionedObjectReference{APIGroup: miloProjectAPIGroup, Kind: miloProjectKind, Name: testProject}, + Requests: []quotav1alpha1.ResourceRequest{{ResourceType: quotaResourceTypeInstances, Amount: 1}}, + }, + Status: quotav1alpha1.ResourceClaimStatus{ + Conditions: []metav1.Condition{ + { + Type: quotav1alpha1.ResourceClaimGranted, + Status: metav1.ConditionTrue, + Reason: quotav1alpha1.ResourceClaimGrantedReason, + Message: "granted", + LastTransitionTime: metav1.Now(), + }, + }, + }, + } + + quotaClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(grantedClaim). + WithStatusSubresource("av1alpha1.ResourceClaim{}). + Build() + + mgr := &fakeMCManager{ + clusters: map[string]cluster.Cluster{ + testProject: newFakeCluster(projectClient), + }, + } + qm := quota.New(nil) + qm.StoreClient(testProject, quotaClient) + + r := &InstanceReconciler{ + mgr: mgr, + scheme: s, + quotaClientManager: qm, + edgeClusterName: testEdgeClusterName, + recorder: fakeRecorder, + projectIDForInstance: func(_ context.Context, cn multicluster.ClusterName, _ *computev1alpha.Instance) (string, error) { + return string(cn), nil + }, + } + r.finalizers = finalizer.NewFinalizers() + require.NoError(t, r.finalizers.Register(instanceControllerFinalizer, r)) + + // Single reconcile: reconcileQuotaCondition writes QuotaGranted=True with + // ObservedGeneration=2 into the in-memory instance, status is persisted, + // then removeQuotaSchedulingGate reads the in-memory condition (gen=2 == + // instance.Generation=2) and removes the gate — all in one pass. 
+ _, err := r.Reconcile(context.Background(), reconcileReq()) + require.NoError(t, err) + + var updated computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Namespace: testNS, Name: testInstance}, &updated)) + + hasGate := false + for _, g := range updated.Spec.Controller.SchedulingGates { + if g.Name == instancecontrol.QuotaSchedulingGate.String() { + hasGate = true + } + } + assert.False(t, hasGate, "gate should be removed in the same reconcile that refreshes the condition to current generation") + + cond := apimeta.FindStatusCondition(updated.Status.Conditions, computev1alpha.InstanceQuotaGranted) + require.NotNil(t, cond) + assert.Equal(t, int64(2), cond.ObservedGeneration, "condition must reflect current generation") + }) +} diff --git a/internal/controller/instance_projector.go b/internal/controller/instance_projector.go new file mode 100644 index 00000000..fa0b69b6 --- /dev/null +++ b/internal/controller/instance_projector.go @@ -0,0 +1,214 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package controller + +import ( + "context" + "fmt" + "strings" + "time" + + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/manager" + mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager" + "sigs.k8s.io/multicluster-runtime/pkg/multicluster" + + computev1alpha "go.datum.net/compute/api/v1alpha" + "go.miloapis.com/milo/pkg/downstreamclient" +) + +// InstanceProjector watches Instance objects written back to the upstream +// Karmada/management control plane by POP-cell InstanceReconcilers and creates +// read-only projections in the corresponding project namespace within each +// project cluster. 
+// +// Namespace resolution: Reconcile reads the target project namespace name from +// the UpstreamOwnerNamespaceLabel on the upstream Instance and requeues while +// that label is empty; it does not match namespace UIDs or scan the project +// cluster. The upstream Instance itself lives in a namespace named +// `ns-<project namespace UID>` (placeholder reconstructed from the test +// fixtures — NOTE(review): confirm against the federator). +// +// Ownership: each projected Instance is owned by the project WorkloadDeployment +// so that it is garbage-collected via cascading deletion when the deployment is +// removed from the project cluster. +// +// The controller is registered with a standard manager.Manager pointed at the +// upstream Karmada control plane — NOT the multicluster-runtime manager — so +// informer watches are scoped to the upstream control plane. +type InstanceProjector struct { + // FederationClient reads Instance objects from the Karmada federation control + // plane (configured via --federation-kubeconfig). Must be set before + // SetupWithManager is called. + FederationClient client.Client + + // MCManager provides access to project cluster clients via GetCluster. + MCManager mcmanager.Manager +} + +// +kubebuilder:rbac:groups=compute.datumapis.com,resources=instances,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups=compute.datumapis.com,resources=instances/status,verbs=get;update;patch + +func (r *InstanceProjector) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + logger := log.FromContext(ctx).WithValues("instance", req.NamespacedName) + + // 1. Fetch the Instance from the upstream Karmada control plane. + var downstreamInstance computev1alpha.Instance + if err := r.FederationClient.Get(ctx, req.NamespacedName, &downstreamInstance); err != nil { + if apierrors.IsNotFound(err) { + // Instance was deleted from the upstream control plane. Projections + // are owned by the project WorkloadDeployment, so cascading deletion + // handles cleanup.
+ return ctrl.Result{}, nil + } + return ctrl.Result{}, fmt.Errorf("failed getting upstream instance: %w", err) + } + + // Only project Instances that carry the upstream tracking label; others were + // not written by our InstanceReconciler write-back logic. + encodedClusterName, ok := downstreamInstance.Labels[downstreamclient.UpstreamOwnerClusterNameLabel] + if !ok { + logger.V(1).Info("skipping instance without upstream cluster label") + return ctrl.Result{}, nil + } + + // 2. Resolve the project cluster name. + // The encoded form is "cluster-<cluster name>" with "/" replaced by "_"; + // decoding strips the "cluster-" prefix and maps every "_" back to "/". + clusterName := strings.TrimPrefix(encodedClusterName, "cluster-") + clusterName = strings.ReplaceAll(clusterName, "_", "/") + + // 3. Obtain the project cluster client. + projectCluster, err := r.MCManager.GetCluster(ctx, multicluster.ClusterName(clusterName)) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed getting project cluster %q: %w", clusterName, err) + } + projectClient := projectCluster.GetClient() + + // 4. Resolve the target project namespace from the Instance label. + // The InstanceReconciler stamps UpstreamOwnerNamespaceLabel with the project + // namespace name (read from the upstream Karmada namespace label set by the federator), + // so we can resolve the target namespace directly without scanning. + targetNamespace := downstreamInstance.Labels[downstreamclient.UpstreamOwnerNamespaceLabel] + if targetNamespace == "" { + logger.Info("Instance missing upstream-namespace label, requeueing", + "namespace", downstreamInstance.Namespace, "name", downstreamInstance.Name) + return ctrl.Result{RequeueAfter: 5 * time.Second}, nil + } + + // 5. Resolve the owning WorkloadDeployment by NAME in the project cluster. + // + // Core invariant: the ownerReference MUST be built from a project-cluster + // object obtained via projectClient.Get — never from any edge/Karmada + // identity.
The WD name is stable across all planes (project cluster, Karmada, + // edge) and is the correct cross-plane identifier. + // + // Resolution order: + // a) Read WorkloadDeploymentNameLabel from the downstream Instance (stamped by + // the edge stateful control strategy). + // b) If absent (Instances created before the label was introduced), fall back + // to stripping the trailing "-<ordinal>" suffix from the Instance name. + wdName := downstreamInstance.Labels[computev1alpha.WorkloadDeploymentNameLabel] + if wdName == "" { + wdName = wdNameFromInstanceName(downstreamInstance.Name) + } + if wdName == "" { + logger.Info("cannot resolve WorkloadDeployment name from Instance — skipping projection", + "instance", downstreamInstance.Name) + return ctrl.Result{}, nil + } + + // Fetch the project-cluster WD directly by name. The returned object carries + // the project-cluster metadata.uid — the only UID that GC in the project + // cluster can act on. + var ownerWD computev1alpha.WorkloadDeployment + if err := projectClient.Get(ctx, client.ObjectKey{Namespace: targetNamespace, Name: wdName}, &ownerWD); err != nil { + if apierrors.IsNotFound(err) { + // Either a transient ordering race (Instance projected before + // WorkloadReconciler created the project WD) or the WD has been + // deleted. In both cases, do NOT create an ownerless projection. + // Requeue so the projection is created with a correct owner + // reference once the WD exists. The 5 s interval matches the + // existing upstream-namespace label requeue above. + logger.Info("project WorkloadDeployment not found — requeueing without creating projection", + "wdName", wdName, "namespace", targetNamespace) + return ctrl.Result{RequeueAfter: 5 * time.Second}, nil + } + return ctrl.Result{}, fmt.Errorf("failed getting WorkloadDeployment %s/%s in project cluster %s: %w", + targetNamespace, wdName, clusterName, err) + } + + // 6. Create or update the projection in the project namespace.
+ projection := &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: downstreamInstance.Name, + Namespace: targetNamespace, + }, + } + + operationResult, err := controllerutil.CreateOrUpdate(ctx, projectClient, projection, func() error { + // Propagate upstream tracking labels so consumers can filter by origin. + if projection.Labels == nil { + projection.Labels = make(map[string]string) + } + for k, v := range downstreamInstance.Labels { + projection.Labels[k] = v + } + + projection.Spec = downstreamInstance.Spec + + // Attach an owner reference using the live project-cluster WD object. + // controllerutil.SetOwnerReference reads UID and GVK from ownerWD, which + // was fetched from projectClient — satisfying the core invariant. + return controllerutil.SetOwnerReference(&ownerWD, projection, projectCluster.GetScheme()) + }) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed upserting Instance projection in %s/%s: %w", clusterName, targetNamespace, err) + } + + logger.Info("reconciled Instance projection", "operation", operationResult, "namespace", targetNamespace, "cluster", clusterName) + + // 7. Sync status — status is a separate subresource. + projection.Status = downstreamInstance.Status + if err := projectClient.Status().Update(ctx, projection); err != nil && !apierrors.IsNotFound(err) { + return ctrl.Result{}, fmt.Errorf("failed updating Instance projection status: %w", err) + } + + return ctrl.Result{}, nil +} + +// wdNameFromInstanceName derives the WorkloadDeployment name from an Instance +// name by stripping the trailing "-<ordinal>" suffix. Instance names follow the +// convention "<wd-name>-<ordinal>" (e.g. "my-api-default-dfw-0"), which is +// structurally enforced by the stateful control strategy. Returns empty string +// if the name does not contain a numeric suffix (unrecognised format), which +// includes names with no "-", names whose only "-" is the first character, and +// names ending in "-". +// +// This is used as a fallback when the WorkloadDeploymentNameLabel is absent on +// Instances created before that label was introduced.
+func wdNameFromInstanceName(name string) string { + lastDash := strings.LastIndex(name, "-") + if lastDash <= 0 { + return "" + } + suffix := name[lastDash+1:] + for _, c := range suffix { + if c < '0' || c > '9' { + return "" + } + } + if len(suffix) == 0 { + return "" + } + return name[:lastDash] +} + +// SetupWithManager registers the InstanceProjector with upstreamMgr, a standard +// manager.Manager configured against the upstream Karmada/federation control plane +// REST config. FederationClient and MCManager must be set before calling this method. +func (r *InstanceProjector) SetupWithManager(upstreamMgr manager.Manager) error { + return ctrl.NewControllerManagedBy(upstreamMgr). + For(&computev1alpha.Instance{}). + Named("instance-projector"). + Complete(r) +} diff --git a/internal/controller/instance_projector_test.go b/internal/controller/instance_projector_test.go new file mode 100644 index 00000000..7dcc8168 --- /dev/null +++ b/internal/controller/instance_projector_test.go @@ -0,0 +1,492 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package controller + +import ( + "context" + "maps" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + + computev1alpha "go.datum.net/compute/api/v1alpha" + "go.miloapis.com/milo/pkg/downstreamclient" +) + +// ─── Test constants ─────────────────────────────────────────────────────────── + +const ( + // projTestCluster is the project cluster name used in projector tests. + projTestCluster = "project-cluster" + + // projTestProjNS is the project namespace name. + projTestProjNS = "proj-namespace" + + // projTestProjNSUID is the project namespace UID embedded in the Karmada + // namespace name below. 
+ projTestProjNSUID = types.UID("deadbeef-1111-2222-3333-444455556666") + + // projTestKarmadaNS is the Karmada namespace derived from the UID above + // via the ns-<namespace UID> convention. + projTestKarmadaNS = "ns-deadbeef-1111-2222-3333-444455556666" + + // projTestInstanceName is the name of the Karmada (and projected) Instance. + // Follows the "<wd-name>-<ordinal>" convention: "my-wd-0". + projTestInstanceName = "my-wd-0" + + // projTestWDUID is the UID of the owning WorkloadDeployment as it exists in + // the PROJECT cluster. This is the UID that owner references must use, since + // Kubernetes GC in the project cluster only knows this UID. + projTestWDUID = types.UID("project-wd-uid-9999-aaaa-bbbb-cccc") + + // projTestEdgeWDUID is the UID of the WorkloadDeployment as it exists on the + // EDGE/Karmada plane. Each plane mints its own UID, so this is intentionally + // distinct from projTestWDUID. The WorkloadDeploymentUIDLabel on downstream + // Instances carries this edge UID — NOT the project UID. + projTestEdgeWDUID = types.UID("edge-uid-0000-1111-2222-3333") + + // projTestWDName is the name of the owning WorkloadDeployment. The name is + // the same across all planes (project cluster, Karmada, edge) and is the + // correct cross-plane stable identifier. + projTestWDName = "my-wd" + + // projTestWorkloadUID is the UID of the owning Workload (carried via WorkloadUIDLabel). + projTestWorkloadUID = "wl-uid-1111-2222-3333-4444" + + // projTestInstanceIndex is the ordinal index of the instance (carried via InstanceIndexLabel). + projTestInstanceIndex = "0" +) + +// encodedCluster returns the value of the UpstreamOwnerClusterNameLabel for +// projTestCluster, i.e. the literal prefix "cluster-" followed by the cluster name. +func encodedCluster() string { + return "cluster-" + projTestCluster +} + +// ─── Helpers ───────────────────────────────────────────────────────────────── + +// projTestProjectNS builds the project cluster Namespace with the stable test UID.
+func projTestProjectNS() *corev1.Namespace { + return &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: projTestProjNS, + UID: projTestProjNSUID, + }, + } +} + +// projTestWorkloadDeployment builds the project WorkloadDeployment that owns +// projected Instances. +func projTestWorkloadDeployment() *computev1alpha.WorkloadDeployment { + return &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: projTestWDName, + Namespace: projTestProjNS, + UID: projTestWDUID, + }, + Spec: computev1alpha.WorkloadDeploymentSpec{ + CityCode: "LAX", + PlacementName: testDefaultPlacement, + WorkloadRef: computev1alpha.WorkloadReference{Name: "my-workload"}, + ScaleSettings: computev1alpha.HorizontalScaleSettings{MinReplicas: 1}, + }, + } +} + +// projTestKarmadaInstance builds a Karmada Instance with the default labels +// needed for the InstanceProjector to act on it. Optional label overrides are +// applied last. +func projTestKarmadaInstance(labelOverrides map[string]string) *computev1alpha.Instance { + labels := map[string]string{ + downstreamclient.UpstreamOwnerClusterNameLabel: encodedCluster(), + downstreamclient.UpstreamOwnerNamespaceLabel: projTestProjNS, + // WorkloadDeploymentUIDLabel carries the EDGE UID — intentionally distinct + // from projTestWDUID (the project-cluster WD UID). Owner references must + // never be built from this value. + computev1alpha.WorkloadDeploymentUIDLabel: string(projTestEdgeWDUID), + computev1alpha.WorkloadDeploymentNameLabel: projTestWDName, + computev1alpha.WorkloadUIDLabel: projTestWorkloadUID, + computev1alpha.InstanceIndexLabel: projTestInstanceIndex, + } + maps.Copy(labels, labelOverrides) + return &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: projTestInstanceName, + Namespace: projTestKarmadaNS, + Labels: labels, + }, + Spec: computev1alpha.InstanceSpec{ + // Minimal valid spec — actual content is copied to the projection. 
+ }, + } +} + +// newTestProjector wires an InstanceProjector with the given downstream client and +// a project cluster that serves the supplied project client. +func newTestProjector(karmadaClient client.Client, projectClient client.Client) *InstanceProjector { + projectCluster := newFakeCluster(projectClient) + mgr := newFakeMCManager(projTestCluster, projectCluster) + return &InstanceProjector{ + FederationClient: karmadaClient, + MCManager: mgr, + } +} + +// projectorRequest builds a ctrl.Request for the test Instance in Karmada. +func projectorRequest() ctrl.Request { + return ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: projTestInstanceName, + Namespace: projTestKarmadaNS, + }, + } +} + +// ─── Tests ─────────────────────────────────────────────────────────────────── + +// TestInstanceProjector_Reconcile is the primary table-driven test. +func TestInstanceProjector_Reconcile(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + + // karmadaInstance is what exists in the Karmada API server. + // A nil value means the Instance does not exist (not-found path). + karmadaInstance *computev1alpha.Instance + + // projectObjs are pre-populated in the project cluster fake client. + projectObjs []client.Object + + // request overrides the default projectorRequest() when set. + request *ctrl.Request + + // wantProjection controls whether a projected Instance should appear. + wantProjection bool + + // wantOwnerRef controls whether the projected Instance should have an + // owner reference pointing to the project WorkloadDeployment. + wantOwnerRef bool + + // wantRequeue controls whether the reconcile result should request a requeue. + wantRequeue bool + + // wantErr controls whether the reconcile should return an error. 
+ wantErr bool + }{ + { + name: "happy path — instance projected with owner reference", + karmadaInstance: projTestKarmadaInstance(nil), + projectObjs: []client.Object{ + projTestProjectNS(), + projTestWorkloadDeployment(), + }, + wantProjection: true, + wantOwnerRef: true, + }, + { + // Cross-plane UID regression test: the Karmada Instance carries the EDGE + // WD UID in WorkloadDeploymentUIDLabel (projTestEdgeWDUID), which is + // intentionally different from the project-cluster WD UID (projTestWDUID). + // The owner reference on the projection must use the project-cluster UID. + // This test fails if someone reintroduces UID-based matching against the + // edge/Karmada plane. + name: "WD name label present, edge UID differs from project UID — owner ref UID equals project WD UID", + karmadaInstance: projTestKarmadaInstance(nil), // carries projTestEdgeWDUID, not projTestWDUID + projectObjs: []client.Object{ + projTestProjectNS(), + projTestWorkloadDeployment(), // UID is projTestWDUID + }, + wantProjection: true, + wantOwnerRef: true, + }, + { + // Fallback: when WorkloadDeploymentNameLabel is absent (Instances created + // before the label was introduced), the projector derives the WD name from + // the Instance name by stripping the trailing "-<ordinal>" suffix. + // Instance name "my-wd-0" → WD name "my-wd". + name: "WD name label absent, fallback name extraction from instance name — owner ref attached", + karmadaInstance: projTestKarmadaInstance(map[string]string{ + // Blank the name label (the override sets it to "", which the projector treats like an absent label) to exercise the fallback path. + computev1alpha.WorkloadDeploymentNameLabel: "", + }), + projectObjs: []client.Object{ + projTestProjectNS(), + projTestWorkloadDeployment(), + }, + wantProjection: true, + wantOwnerRef: true, + }, + { + // NotFound requeue: when the project WD does not yet exist (transient + // ordering race — Instance projected before WorkloadReconciler created + // the project WD), the projector must requeue and NOT create an ownerless + // projection.
A projection must never be created without an owner reference. + name: "project WD not found — requeue, no ownerless projection created", + karmadaInstance: projTestKarmadaInstance(nil), + projectObjs: []client.Object{ + projTestProjectNS(), + // No WorkloadDeployment — simulates the transient ordering race. + }, + wantProjection: false, + wantRequeue: true, + }, + { + // Unresolvable WD name: both the label is absent and the Instance name has + // no numeric suffix to strip (unrecognised naming format). The projector + // should skip without error — no projection created, no requeue. + // The instance name "inst-no-ordinal" has no trailing numeric segment. + name: "WD name label absent and instance name yields no resolvable WD — skip, no projection", + karmadaInstance: &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: "inst-no-ordinal", + Namespace: projTestKarmadaNS, + Labels: map[string]string{ + downstreamclient.UpstreamOwnerClusterNameLabel: encodedCluster(), + downstreamclient.UpstreamOwnerNamespaceLabel: projTestProjNS, + // No WorkloadDeploymentNameLabel — no label, no numeric suffix. + computev1alpha.WorkloadUIDLabel: projTestWorkloadUID, + computev1alpha.InstanceIndexLabel: projTestInstanceIndex, + }, + }, + }, + request: &ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: "inst-no-ordinal", + Namespace: projTestKarmadaNS, + }, + }, + projectObjs: []client.Object{projTestProjectNS()}, + wantProjection: false, + wantRequeue: false, + }, + { + name: "missing upstream-cluster-name label — skipped, no projection", + karmadaInstance: &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: projTestInstanceName, + Namespace: projTestKarmadaNS, + // Intentionally no UpstreamOwnerClusterNameLabel. 
+ Labels: map[string]string{ + "some-other-label": "value", + }, + }, + }, + projectObjs: []client.Object{projTestProjectNS()}, + wantProjection: false, + }, + { + name: "missing upstream-namespace label — requeue", + karmadaInstance: projTestKarmadaInstance(map[string]string{ + // Override: remove the upstream namespace label. + downstreamclient.UpstreamOwnerNamespaceLabel: "", + }), + projectObjs: []client.Object{projTestProjectNS()}, + wantProjection: false, + wantRequeue: true, + }, + { + name: "karmada instance not found — no-op", + karmadaInstance: nil, // causes Get to return NotFound + projectObjs: []client.Object{projTestProjectNS()}, + wantProjection: false, + }, + { + // Verify that all linking labels (WorkloadUID, WorkloadDeploymentUID, + // WorkloadDeploymentNameLabel, InstanceIndex) survive from the Karmada + // write-back object through to the projection. + name: "all linking labels propagated from Karmada to projection", + karmadaInstance: projTestKarmadaInstance(nil), + projectObjs: []client.Object{ + projTestProjectNS(), + projTestWorkloadDeployment(), + }, + wantProjection: true, + wantOwnerRef: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + // Build Karmada client. + var karmadaObjs []client.Object + if tt.karmadaInstance != nil { + karmadaObjs = append(karmadaObjs, tt.karmadaInstance) + } + karmadaClient := newKarmadaFakeClient(karmadaObjs...) + + // Build project client. + projectClient := fake.NewClientBuilder(). + WithScheme(newProjectScheme()). + WithObjects(tt.projectObjs...). + WithStatusSubresource(&computev1alpha.Instance{}). 
+ Build() + + r := newTestProjector(karmadaClient, projectClient) + + req := projectorRequest() + if tt.request != nil { + req = *tt.request + } + result, err := r.Reconcile(context.Background(), req) + + if tt.wantErr { + require.Error(t, err) + return + } + require.NoError(t, err) + + if tt.wantRequeue { + assert.NotZero(t, result.RequeueAfter, "expected RequeueAfter to be set") + } else { + assert.Equal(t, ctrl.Result{}, result) + } + + ctx := context.Background() + + // Check whether a projected Instance exists in the project namespace. + var projection computev1alpha.Instance + err = projectClient.Get(ctx, types.NamespacedName{ + Name: projTestInstanceName, + Namespace: projTestProjNS, + }, &projection) + + if !tt.wantProjection { + assert.True(t, isNotFound(err), + "expected no projection in project namespace, but found one (or unexpected error: %v)", err) + return + } + + require.NoError(t, err, "expected projection to exist in project namespace") + + // Labels should be copied from the Karmada instance. + if tt.karmadaInstance != nil { + for k, v := range tt.karmadaInstance.Labels { + assert.Equal(t, v, projection.Labels[k], + "projection label %q should match Karmada instance label", k) + } + } + + // Linking labels must survive from the Karmada instance to the projection + // so that the CLI can resolve Workload name, city, and instance ordinal. 
+ if tt.wantProjection && tt.karmadaInstance != nil { + assert.Equal(t, + tt.karmadaInstance.Labels[computev1alpha.WorkloadUIDLabel], + projection.Labels[computev1alpha.WorkloadUIDLabel], + "WorkloadUIDLabel must be propagated to the projection") + assert.Equal(t, + tt.karmadaInstance.Labels[computev1alpha.WorkloadDeploymentUIDLabel], + projection.Labels[computev1alpha.WorkloadDeploymentUIDLabel], + "WorkloadDeploymentUIDLabel must be propagated to the projection") + assert.Equal(t, + tt.karmadaInstance.Labels[computev1alpha.WorkloadDeploymentNameLabel], + projection.Labels[computev1alpha.WorkloadDeploymentNameLabel], + "WorkloadDeploymentNameLabel must be propagated to the projection") + assert.Equal(t, + tt.karmadaInstance.Labels[computev1alpha.InstanceIndexLabel], + projection.Labels[computev1alpha.InstanceIndexLabel], + "InstanceIndexLabel must be propagated to the projection") + } + + // Owner reference check. + if tt.wantOwnerRef { + require.NotEmpty(t, projection.OwnerReferences, + "projected instance should have an owner reference to the WorkloadDeployment") + ownerRef := projection.OwnerReferences[0] + // Core invariant: owner ref UID must be the PROJECT-cluster WD UID. + assert.Equal(t, string(projTestWDUID), string(ownerRef.UID), + "owner reference UID must match the project-cluster WorkloadDeployment UID") + // Regression guard: the edge UID must NOT appear in the owner ref. + // If this assertion fails, someone reintroduced cross-plane UID matching. 
+ assert.NotEqual(t, string(projTestEdgeWDUID), string(ownerRef.UID), + "owner reference UID must NOT be the edge/Karmada WD UID") + assert.Equal(t, projTestWDName, ownerRef.Name, + "owner reference name should match the WorkloadDeployment name") + } else { + assert.Empty(t, projection.OwnerReferences, + "projected instance should have no owner reference when WD not found") + } + }) + } +} + +// TestInstanceProjector_SpecCopied verifies that the Instance spec is correctly +// propagated from the Karmada instance to the projection. +func TestInstanceProjector_SpecCopied(t *testing.T) { + t.Parallel() + + karmadaInst := projTestKarmadaInstance(nil) + // Set a recognizable spec field we can assert against. + karmadaInst.Spec.Controller = &computev1alpha.InstanceController{ + SchedulingGates: []computev1alpha.SchedulingGate{{Name: "test-gate"}}, + } + + projectClient := fake.NewClientBuilder(). + WithScheme(newProjectScheme()). + WithObjects(projTestProjectNS(), projTestWorkloadDeployment()). + WithStatusSubresource(&computev1alpha.Instance{}). + Build() + karmadaClient := newKarmadaFakeClient(karmadaInst) + + r := newTestProjector(karmadaClient, projectClient) + _, err := r.Reconcile(context.Background(), projectorRequest()) + require.NoError(t, err) + + var projection computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Name: projTestInstanceName, Namespace: projTestProjNS}, + &projection)) + + require.NotNil(t, projection.Spec.Controller) + require.Len(t, projection.Spec.Controller.SchedulingGates, 1) + assert.Equal(t, "test-gate", projection.Spec.Controller.SchedulingGates[0].Name) +} + +// TestInstanceProjector_NamespaceResolution verifies that the projector resolves +// the target project namespace directly from the UpstreamOwnerNamespaceLabel on +// the Karmada Instance, landing the projection in the correct namespace. 
+func TestInstanceProjector_NamespaceResolution(t *testing.T) { + t.Parallel() + + karmadaInst := projTestKarmadaInstance(nil) + projectClient := fake.NewClientBuilder(). + WithScheme(newProjectScheme()). + WithObjects( + projTestProjectNS(), + projTestWorkloadDeployment(), + ). + WithStatusSubresource(&computev1alpha.Instance{}). + Build() + karmadaClient := newKarmadaFakeClient(karmadaInst) + + r := newTestProjector(karmadaClient, projectClient) + result, err := r.Reconcile(context.Background(), projectorRequest()) + require.NoError(t, err) + assert.Equal(t, ctrl.Result{}, result) + + // Projection must land in the namespace named by the label. + var projection computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Name: projTestInstanceName, Namespace: projTestProjNS}, + &projection)) +} + +// isNotFound reports whether err is a Kubernetes NotFound error; it returns +// false for a nil err (the Get succeeded) and for any other kind of error. +// Used to distinguish "no projection created" from "projection exists" or "Get failed". +func isNotFound(err error) bool { + if err == nil { + return false // object exists — not the "not found" case + } + // For a non-nil err, client.IgnoreNotFound yields nil only when err is NotFound.
+ return client.IgnoreNotFound(err) == nil +} diff --git a/internal/controller/instance_writeback_test.go b/internal/controller/instance_writeback_test.go new file mode 100644 index 00000000..0112a630 --- /dev/null +++ b/internal/controller/instance_writeback_test.go @@ -0,0 +1,448 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package controller + +import ( + "context" + "fmt" + "strings" + "sync" + "testing" + + "github.com/go-logr/logr" + "github.com/go-logr/logr/funcr" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/multicluster-runtime/pkg/multicluster" + + computev1alpha "go.datum.net/compute/api/v1alpha" + "go.miloapis.com/milo/pkg/downstreamclient" +) + +// ─── Log capture helper ─────────────────────────────────────────────────────── + +// logEntry holds one captured funcr write: the logger prefix and the rendered args string. +type logEntry struct { + msg string // funcr's prefix argument, not the log message itself + kvs string // funcr renders the message and key-value pairs as a single string +} + +// captureLogger returns a logr.Logger backed by an in-memory sink and a pointer to the +// captured entries. Appends are mutex-guarded; read the slice only after logging is done.
+func captureLogger() (logr.Logger, *[]logEntry) { + var mu sync.Mutex + var entries []logEntry + logger := funcr.New(func(prefix, args string) { + mu.Lock() + defer mu.Unlock() + entries = append(entries, logEntry{msg: prefix, kvs: args}) + }, funcr.Options{}) + return logger, &entries +} + +// ─── write-back test constants ──────────────────────────────────────────────── + +const ( + wbTestClusterName = "edge-cluster" + wbTestNamespace = "ns-proj-uid-1234" + wbTestInstanceName = "inst-0" + wbTestWorkloadUID = "wl-uid-aaaa-bbbb" + wbTestWDUID = "wd-uid-cccc-dddd" + wbTestInstanceIndex = "0" + wbTestUpstreamNS = "proj-namespace" + wbTestEncodedCluster = "cluster-" + wbTestClusterName + + // Four new self-describing labels. + wbTestWDName = "my-workload-deployment" + wbTestCityCode = "DFW" + wbTestWorkloadName = "my-workload" + wbTestPlacement = "us-central" +) + +// wbTestCellInstance builds a cell-side Instance with all seven owned labels +// pre-populated, as addInstanceControllerLabels would produce. 
+func wbTestCellInstance() *computev1alpha.Instance { + return &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: wbTestInstanceName, + Namespace: wbTestNamespace, + Labels: map[string]string{ + computev1alpha.WorkloadUIDLabel: wbTestWorkloadUID, + computev1alpha.WorkloadDeploymentUIDLabel: wbTestWDUID, + computev1alpha.InstanceIndexLabel: wbTestInstanceIndex, + computev1alpha.WorkloadDeploymentNameLabel: wbTestWDName, + computev1alpha.CityCodeLabel: wbTestCityCode, + computev1alpha.WorkloadNameLabel: wbTestWorkloadName, + computev1alpha.PlacementNameLabel: wbTestPlacement, + }, + }, + Spec: computev1alpha.InstanceSpec{ + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{InstanceType: testInstanceType}, + }, + }, + Status: computev1alpha.InstanceStatus{ + Conditions: []metav1.Condition{ + { + Type: computev1alpha.InstanceReady, + Status: metav1.ConditionTrue, + Reason: computev1alpha.InstanceReadyReasonAvailable, + Message: "Instance is ready", + LastTransitionTime: metav1.Now(), + }, + }, + }, + } +} + +// wbTestDownstreamNS returns a Namespace object in the downstream (Karmada) +// control plane that carries the upstream routing labels, simulating the +// namespace stamped by NSO's MappedNamespaceResourceStrategy. +func wbTestDownstreamNS() *corev1.Namespace { + return &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: wbTestNamespace, + Labels: map[string]string{ + downstreamclient.UpstreamOwnerNamespaceLabel: wbTestUpstreamNS, + downstreamclient.UpstreamOwnerClusterNameLabel: wbTestEncodedCluster, + }, + }, + } +} + +// newWriteBackReconciler returns an InstanceReconciler with only FederationClient +// set (to federationClient); all other reconciler fields keep their zero values.
+func newWriteBackReconciler(federationClient client.Client) *InstanceReconciler { + return &InstanceReconciler{ + FederationClient: federationClient, + } +} + +// ─── Tests ─────────────────────────────────────────────────────────────────── + +// TestWriteBackToUpstream_CreatePath_AllLabels (Case A) verifies that the first +// write-back to an empty Karmada control plane creates an Instance with all five +// expected labels (two routing + three linking) and also writes the cell-side +// status via Status().Update. +func TestWriteBackToUpstream_CreatePath_AllLabels(t *testing.T) { + t.Parallel() + + s := newKarmadaScheme() + upstreamClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(wbTestDownstreamNS()). + WithStatusSubresource(&computev1alpha.Instance{}). + Build() + + r := newWriteBackReconciler(upstreamClient) + cellInstance := wbTestCellInstance() + + err := r.writeBackToUpstream(context.Background(), multicluster.ClusterName(wbTestClusterName), cellInstance) + require.NoError(t, err) + + // Verify the created Karmada Instance carries all five expected labels. 
+ var created computev1alpha.Instance + require.NoError(t, upstreamClient.Get(context.Background(), + types.NamespacedName{Namespace: wbTestNamespace, Name: wbTestInstanceName}, + &created)) + + assert.Equal(t, wbTestEncodedCluster, created.Labels[downstreamclient.UpstreamOwnerClusterNameLabel], + "UpstreamOwnerClusterNameLabel must be set") + assert.Equal(t, wbTestUpstreamNS, created.Labels[downstreamclient.UpstreamOwnerNamespaceLabel], + "UpstreamOwnerNamespaceLabel must be set") + assert.Equal(t, wbTestWorkloadUID, created.Labels[computev1alpha.WorkloadUIDLabel], + "WorkloadUIDLabel must be propagated from cell instance") + assert.Equal(t, wbTestWDUID, created.Labels[computev1alpha.WorkloadDeploymentUIDLabel], + "WorkloadDeploymentUIDLabel must be propagated from cell instance") + assert.Equal(t, wbTestInstanceIndex, created.Labels[computev1alpha.InstanceIndexLabel], + "InstanceIndexLabel must be propagated from cell instance") + + // Status must have been written via Status().Update after Create. + require.Len(t, created.Status.Conditions, 1, + "Status().Update must be called after Create; condition should be present") + assert.Equal(t, computev1alpha.InstanceReady, created.Status.Conditions[0].Type) + assert.Equal(t, metav1.ConditionTrue, created.Status.Conditions[0].Status) +} + +// TestWriteBackToUpstream_UpdatePath_LabelMerge (Case B) verifies that an +// existing Karmada Instance with a Karmada-managed label retains that label +// after the update path runs, while all five owned labels are written correctly. +func TestWriteBackToUpstream_UpdatePath_LabelMerge(t *testing.T) { + t.Parallel() + + karmadaManagedLabel := "karmada.io/managed" + + // Pre-populate the Karmada control plane with an Instance that has the old + // two-label map plus a simulated Karmada-managed label. 
+ existingKarmadaInstance := &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: wbTestInstanceName, + Namespace: wbTestNamespace, + Labels: map[string]string{ + downstreamclient.UpstreamOwnerClusterNameLabel: wbTestEncodedCluster, + downstreamclient.UpstreamOwnerNamespaceLabel: wbTestUpstreamNS, + karmadaManagedLabel: "true", + }, + }, + Spec: computev1alpha.InstanceSpec{ + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{InstanceType: testInstanceType}, + }, + }, + } + + s := newKarmadaScheme() + upstreamClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(wbTestDownstreamNS(), existingKarmadaInstance). + WithStatusSubresource(&computev1alpha.Instance{}). + Build() + + r := newWriteBackReconciler(upstreamClient) + cellInstance := wbTestCellInstance() + + err := r.writeBackToUpstream(context.Background(), multicluster.ClusterName(wbTestClusterName), cellInstance) + require.NoError(t, err) + + var updated computev1alpha.Instance + require.NoError(t, upstreamClient.Get(context.Background(), + types.NamespacedName{Namespace: wbTestNamespace, Name: wbTestInstanceName}, + &updated)) + + // All five owned labels must be present with correct values. + assert.Equal(t, wbTestEncodedCluster, updated.Labels[downstreamclient.UpstreamOwnerClusterNameLabel]) + assert.Equal(t, wbTestUpstreamNS, updated.Labels[downstreamclient.UpstreamOwnerNamespaceLabel]) + assert.Equal(t, wbTestWorkloadUID, updated.Labels[computev1alpha.WorkloadUIDLabel]) + assert.Equal(t, wbTestWDUID, updated.Labels[computev1alpha.WorkloadDeploymentUIDLabel]) + assert.Equal(t, wbTestInstanceIndex, updated.Labels[computev1alpha.InstanceIndexLabel]) + + // The Karmada-managed label must survive the merge (not be replaced/deleted). 
+ assert.Equal(t, "true", updated.Labels[karmadaManagedLabel], + "Karmada-managed label must be preserved after merge; should not be overwritten") +} + +// TestWriteBackToUpstream_LabelChangeTriggerUpdate (Case C) verifies that +// a changed linking label on the cell instance causes the Karmada object to +// be updated with the new value. +func TestWriteBackToUpstream_LabelChangeTriggerUpdate(t *testing.T) { + t.Parallel() + + newWorkloadUID := "wl-uid-CHANGED" + + // Pre-populate with the five-label map from a previous write-back. + existingKarmadaInstance := &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: wbTestInstanceName, + Namespace: wbTestNamespace, + Labels: map[string]string{ + downstreamclient.UpstreamOwnerClusterNameLabel: wbTestEncodedCluster, + downstreamclient.UpstreamOwnerNamespaceLabel: wbTestUpstreamNS, + computev1alpha.WorkloadUIDLabel: wbTestWorkloadUID, + computev1alpha.WorkloadDeploymentUIDLabel: wbTestWDUID, + computev1alpha.InstanceIndexLabel: wbTestInstanceIndex, + }, + }, + Spec: computev1alpha.InstanceSpec{ + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{InstanceType: testInstanceType}, + }, + }, + } + + s := newKarmadaScheme() + upstreamClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(wbTestDownstreamNS(), existingKarmadaInstance). + WithStatusSubresource(&computev1alpha.Instance{}). + Build() + + r := newWriteBackReconciler(upstreamClient) + + // Modify the WorkloadUIDLabel on the cell instance. 
+ cellInstance := wbTestCellInstance() + cellInstance.Labels[computev1alpha.WorkloadUIDLabel] = newWorkloadUID + + err := r.writeBackToUpstream(context.Background(), multicluster.ClusterName(wbTestClusterName), cellInstance) + require.NoError(t, err) + + var updated computev1alpha.Instance + require.NoError(t, upstreamClient.Get(context.Background(), + types.NamespacedName{Namespace: wbTestNamespace, Name: wbTestInstanceName}, + &updated)) + + assert.Equal(t, newWorkloadUID, updated.Labels[computev1alpha.WorkloadUIDLabel], + "WorkloadUIDLabel change on the cell instance must be reflected in the Karmada object") +} + +// TestWriteBackToUpstream_EmptyLinkingLabels_NonFatal (Case D) verifies that +// writeBackToUpstream completes without error when the cell-side Instance has +// no linking labels (e.g. during an early reconcile before +// addInstanceControllerLabels has run). The created Karmada object will carry +// empty string values for the three linking labels, and the RC-2 warning log +// must fire listing all three missing label keys. +func TestWriteBackToUpstream_EmptyLinkingLabels_NonFatal(t *testing.T) { + t.Parallel() + + s := newKarmadaScheme() + upstreamClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(wbTestDownstreamNS()). + WithStatusSubresource(&computev1alpha.Instance{}). + Build() + + r := newWriteBackReconciler(upstreamClient) + + // Instance with nil Labels — simulates an early reconcile with no linking labels. + cellInstance := &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: wbTestInstanceName, + Namespace: wbTestNamespace, + }, + Spec: computev1alpha.InstanceSpec{ + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{InstanceType: testInstanceType}, + }, + }, + } + + // Inject a capturing logger so we can assert the RC-2 warning fires. 
+ capLogger, entries := captureLogger() + ctx := log.IntoContext(context.Background(), capLogger) + + // Must not return an error — empty labels are non-fatal. + err := r.writeBackToUpstream(ctx, multicluster.ClusterName(wbTestClusterName), cellInstance) + require.NoError(t, err) + + // The Karmada object should exist with empty string values for the linking labels. + var created computev1alpha.Instance + require.NoError(t, upstreamClient.Get(context.Background(), + types.NamespacedName{Namespace: wbTestNamespace, Name: wbTestInstanceName}, + &created)) + + assert.Equal(t, "", created.Labels[computev1alpha.WorkloadUIDLabel], + "WorkloadUIDLabel should be empty string when not set on cell instance") + assert.Equal(t, "", created.Labels[computev1alpha.WorkloadDeploymentUIDLabel], + "WorkloadDeploymentUIDLabel should be empty string when not set on cell instance") + assert.Equal(t, "", created.Labels[computev1alpha.InstanceIndexLabel], + "InstanceIndexLabel should be empty string when not set on cell instance") + + // Assert the RC-2 warning was emitted and named all three missing label keys. + // funcr encodes both the message and key-value pairs into the args string; + // we search across the full rendered output for each required substring. 
+ warnMsg := "instance is missing linking labels for write-back" + allRendered := func() string { // joins every captured entry as "<msg> <kvs>", one per line + parts := make([]string, len(*entries)) + for i, e := range *entries { + parts[i] = fmt.Sprintf("%s %s", e.msg, e.kvs) + } + return strings.Join(parts, "\n") + }() + + assert.Contains(t, allRendered, warnMsg, + "expected RC-2 warning %q to be logged; got:\n%s", warnMsg, allRendered) + for _, key := range []string{ + computev1alpha.WorkloadUIDLabel, + computev1alpha.WorkloadDeploymentUIDLabel, + computev1alpha.InstanceIndexLabel, + } { + assert.Contains(t, allRendered, key, + "expected missing label key %q to appear in warning log; got:\n%s", key, allRendered) + } +} + +// TestWriteBackToUpstream_FourNewLabels_CreatePath verifies that all four new +// self-describing labels (WorkloadDeploymentName, CityCode, WorkloadName, +// PlacementName) are written to the Karmada object on the create path. +func TestWriteBackToUpstream_FourNewLabels_CreatePath(t *testing.T) { + t.Parallel() + + s := newKarmadaScheme() + upstreamClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(wbTestDownstreamNS()). + WithStatusSubresource(&computev1alpha.Instance{}).
+ Build() + + r := newWriteBackReconciler(upstreamClient) + cellInstance := wbTestCellInstance() + + err := r.writeBackToUpstream(context.Background(), multicluster.ClusterName(wbTestClusterName), cellInstance) + require.NoError(t, err) + + var created computev1alpha.Instance + require.NoError(t, upstreamClient.Get(context.Background(), + types.NamespacedName{Namespace: wbTestNamespace, Name: wbTestInstanceName}, + &created)) + + assert.Equal(t, wbTestWDName, created.Labels[computev1alpha.WorkloadDeploymentNameLabel], + "WorkloadDeploymentNameLabel must propagate to Karmada object") + assert.Equal(t, wbTestCityCode, created.Labels[computev1alpha.CityCodeLabel], + "CityCodeLabel must propagate to Karmada object") + assert.Equal(t, wbTestWorkloadName, created.Labels[computev1alpha.WorkloadNameLabel], + "WorkloadNameLabel must propagate to Karmada object") + assert.Equal(t, wbTestPlacement, created.Labels[computev1alpha.PlacementNameLabel], + "PlacementNameLabel must propagate to Karmada object") +} + +// TestWriteBackToUpstream_FourNewLabels_UpdatePath verifies that all four new +// self-describing labels are written on the update path and existing Karmada- +// managed labels on the downstream object are preserved. +func TestWriteBackToUpstream_FourNewLabels_UpdatePath(t *testing.T) { + t.Parallel() + + karmadaManagedLabel := "karmada.io/managed" + + existingKarmadaInstance := &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: wbTestInstanceName, + Namespace: wbTestNamespace, + Labels: map[string]string{ + downstreamclient.UpstreamOwnerClusterNameLabel: wbTestEncodedCluster, + downstreamclient.UpstreamOwnerNamespaceLabel: wbTestUpstreamNS, + karmadaManagedLabel: "true", + }, + }, + Spec: computev1alpha.InstanceSpec{ + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{InstanceType: testInstanceType}, + }, + }, + } + + s := newKarmadaScheme() + upstreamClient := fake.NewClientBuilder(). + WithScheme(s). 
+ WithObjects(wbTestDownstreamNS(), existingKarmadaInstance). + WithStatusSubresource(&computev1alpha.Instance{}). + Build() + + r := newWriteBackReconciler(upstreamClient) + cellInstance := wbTestCellInstance() + + err := r.writeBackToUpstream(context.Background(), multicluster.ClusterName(wbTestClusterName), cellInstance) + require.NoError(t, err) + + var updated computev1alpha.Instance + require.NoError(t, upstreamClient.Get(context.Background(), + types.NamespacedName{Namespace: wbTestNamespace, Name: wbTestInstanceName}, + &updated)) + + assert.Equal(t, wbTestWDName, updated.Labels[computev1alpha.WorkloadDeploymentNameLabel], + "WorkloadDeploymentNameLabel must be set on update path") + assert.Equal(t, wbTestCityCode, updated.Labels[computev1alpha.CityCodeLabel], + "CityCodeLabel must be set on update path") + assert.Equal(t, wbTestWorkloadName, updated.Labels[computev1alpha.WorkloadNameLabel], + "WorkloadNameLabel must be set on update path") + assert.Equal(t, wbTestPlacement, updated.Labels[computev1alpha.PlacementNameLabel], + "PlacementNameLabel must be set on update path") + + // Karmada-managed label must survive the merge. 
+ assert.Equal(t, "true", updated.Labels[karmadaManagedLabel], + "Karmada-managed label must be preserved after the update merge") +} diff --git a/internal/controller/instancecontrol/instancecontrol.go b/internal/controller/instancecontrol/instancecontrol.go index 6de9df99..d2c83692 100644 --- a/internal/controller/instancecontrol/instancecontrol.go +++ b/internal/controller/instancecontrol/instancecontrol.go @@ -26,10 +26,11 @@ type Strategy interface { type ActionType string const ( - ActionTypeCreate ActionType = "Create" - ActionTypeUpdate ActionType = "Update" - ActionTypeDelete ActionType = "Delete" - ActionTypeWait ActionType = "Wait" + ActionTypeCreate ActionType = "Create" + ActionTypeUpdate ActionType = "Update" + ActionTypeDelete ActionType = "Delete" + ActionTypeWait ActionType = "Wait" + ActionTypePatchLabels ActionType = "PatchLabels" ) type Action struct { @@ -104,3 +105,22 @@ func NewWaitAction(object client.Object) Action { fn: func(ctx context.Context, c client.Client) error { return nil }, } } + +// NewPatchLabelsAction returns an action that patches updated using a +// client.MergeFrom(base) patch, so only fields that differ between base and +// updated are sent to the API server. Callers should change only labels on +// updated. This is intentionally separate from ActionTypeUpdate so that +// label backfill never participates in the ordered rolling-update flow.
+func NewPatchLabelsAction(updated client.Object, base client.Object) Action { + patch := client.MergeFrom(base) + return Action{ + Object: updated, + actionType: ActionTypePatchLabels, + fn: func(ctx context.Context, c client.Client) error { + if err := c.Patch(ctx, updated, patch); err != nil { + return fmt.Errorf("failed to patch labels on %T %s: %w", updated, updated.GetName(), err) + } + return nil + }, + } +} diff --git a/internal/controller/instancecontrol/stateful/stateful_control.go b/internal/controller/instancecontrol/stateful/stateful_control.go index 566a652c..2d2e3073 100644 --- a/internal/controller/instancecontrol/stateful/stateful_control.go +++ b/internal/controller/instancecontrol/stateful/stateful_control.go @@ -15,13 +15,30 @@ import ( "go.datum.net/compute/internal/controller/instancecontrol" ) +// Options controls optional behaviours of the stateful instance control strategy. +type Options struct { + // NetworkingEnabled controls whether the Network scheduling gate is added to + // newly created Instances. Set to false when the networking integration is + // disabled so that Instances are not blocked waiting for a NetworkBinding. + // New() sets it to true; the zero value (false) omits the Network gate. + NetworkingEnabled bool +} + // Behavior inspired by https://github.com/kubernetes/kubernetes/tree/master/pkg/controller/statefulset // Does not currently implement exact behavior. type statefulControl struct { + opts Options } +// New returns a stateful instance control strategy with networking enabled. func New() instancecontrol.Strategy { - return &statefulControl{} + return NewWithOptions(Options{NetworkingEnabled: true}) +} + +// NewWithOptions returns a stateful instance control strategy with the given +// options, used as-is: no defaults are applied to unset fields.
+func NewWithOptions(opts Options) instancecontrol.Strategy { + return &statefulControl{opts: opts} } func (c *statefulControl) GetActions( @@ -68,15 +85,25 @@ func (c *statefulControl) GetActions( }, Spec: deployment.Spec.Template.Spec, } + // Set Location best-effort: when Status.Location is nil (no matching + // Location object for the city code) Instance.Spec.Location stays nil and + // instance creation proceeds normally — this must not block scheduling. desiredInstances[i].Spec.Location = deployment.Status.Location // TODO(jreese) consider adding scheduling gates via mutating webhooks - desiredInstances[i].Spec.Controller = &v1alpha.InstanceController{ - TemplateHash: instanceTemplateHash, - SchedulingGates: []v1alpha.SchedulingGate{ + gates := []v1alpha.SchedulingGate{ + {Name: instancecontrol.QuotaSchedulingGate.String()}, + } + if c.opts.NetworkingEnabled { + // Prepend the Network gate so it is cleared first; quota is + // independent and evaluated in parallel by InstanceReconciler. + gates = append([]v1alpha.SchedulingGate{ {Name: instancecontrol.NetworkSchedulingGate.String()}, - {Name: instancecontrol.QuotaSchedulingGate.String()}, - }, + }, gates...) + } + desiredInstances[i].Spec.Controller = &v1alpha.InstanceController{ + TemplateHash: instanceTemplateHash, + SchedulingGates: gates, } addInstanceControllerLabels(desiredInstances[i], getInstanceOrdinal(desiredInstances[i].Name), deployment) @@ -114,10 +141,37 @@ func (c *statefulControl) GetActions( } } + // Backfill controller-managed labels on every existing instance, regardless + // of Ready state or template hash. This ensures newly-introduced labels + // (e.g. city-code, workload-name) are applied to pre-existing instances that + // were never touched by a rolling update. The patch is metadata-only and is + // emitted outside the ordered rollout decision so it never gates or reorders + // instance creation/updates. 
+ var patchLabelActions []instancecontrol.Action + for _, instance := range desiredInstances { + if instance.CreationTimestamp.IsZero() || !instance.DeletionTimestamp.IsZero() { + // Skip instances that don't exist yet or are being deleted. + continue + } + + desiredLabels := desiredControllerLabels(getInstanceOrdinal(instance.Name), deployment) + if labelsNeedBackfill(instance.Labels, desiredLabels) { + base := instance.DeepCopy() + patched := instance.DeepCopy() + for k, v := range desiredLabels { + if patched.Labels == nil { + patched.Labels = make(map[string]string) + } + patched.Labels[k] = v + } + patchLabelActions = append(patchLabelActions, instancecontrol.NewPatchLabelsAction(patched, base)) + } + } + slices.SortFunc(updateActions, descendingOrdinal) slices.SortFunc(deleteActions, descendingOrdinal) - actions := make([]instancecontrol.Action, 0, len(createActions)+len(waitActions)+len(updateActions)+len(deleteActions)) + actions := make([]instancecontrol.Action, 0, len(createActions)+len(waitActions)+len(updateActions)+len(deleteActions)+len(patchLabelActions)) switch deployment.Spec.ScaleSettings.InstanceManagementPolicy { case v1alpha.OrderedReadyInstanceManagementPolicyType: @@ -144,6 +198,11 @@ func (c *statefulControl) GetActions( } + // Label-backfill actions are appended after the rollout ordering/skip logic + // so they are never affected by the "skip all but first" rule and never + // participate in rollout sequencing. + actions = append(actions, patchLabelActions...) 
+ return actions, nil } @@ -152,7 +211,37 @@ func addInstanceControllerLabels(instance *v1alpha.Instance, index int, deployme instance.Labels = map[string]string{} } - instance.Labels[v1alpha.InstanceIndexLabel] = strconv.Itoa(index) - instance.Labels[v1alpha.WorkloadUIDLabel] = string(deployment.Spec.WorkloadRef.UID) - instance.Labels[v1alpha.WorkloadDeploymentUIDLabel] = string(deployment.GetUID()) + for k, v := range desiredControllerLabels(index, deployment) { + instance.Labels[k] = v + } +} + +// desiredControllerLabels returns the full set of controller-managed labels +// that every instance should carry. Used both when stamping a new/updated +// instance and when checking whether an existing instance needs a backfill +// patch. +func desiredControllerLabels(index int, deployment *v1alpha.WorkloadDeployment) map[string]string { + return map[string]string{ + v1alpha.InstanceIndexLabel: strconv.Itoa(index), + v1alpha.WorkloadUIDLabel: string(deployment.Spec.WorkloadRef.UID), + v1alpha.WorkloadDeploymentUIDLabel: string(deployment.GetUID()), + // Self-describing labels for routing, filtering, and observability. + // Backfilled on every reconcile so they stay accurate even for instances + // that pre-date the labels or that were not reached by a rolling update. + v1alpha.WorkloadDeploymentNameLabel: deployment.GetName(), + v1alpha.CityCodeLabel: deployment.Spec.CityCode, + v1alpha.WorkloadNameLabel: deployment.Spec.WorkloadRef.Name, + v1alpha.PlacementNameLabel: deployment.Spec.PlacementName, + } +} + +// labelsNeedBackfill reports whether any of the desired controller-managed +// label key/value pairs are absent or incorrect on the current instance labels. 
+func labelsNeedBackfill(current map[string]string, desired map[string]string) bool { + for k, v := range desired { + if current[k] != v { + return true + } + } + return false } diff --git a/internal/controller/instancecontrol/stateful/stateful_control_test.go b/internal/controller/instancecontrol/stateful/stateful_control_test.go index d45b24b3..4f88eb8e 100644 --- a/internal/controller/instancecontrol/stateful/stateful_control_test.go +++ b/internal/controller/instancecontrol/stateful/stateful_control_test.go @@ -13,6 +13,8 @@ import ( utilruntime "k8s.io/apimachinery/pkg/util/runtime" "k8s.io/utils/ptr" + networkingv1alpha "go.datum.net/network-services-operator/api/v1alpha" + "go.datum.net/compute/api/v1alpha" "go.datum.net/compute/internal/controller/instancecontrol" ) @@ -53,7 +55,7 @@ func TestUpdateWithAllReadyInstances(t *testing.T) { deployment := getWorkloadDeployment("test-deploy", 2) - var currentInstances []v1alpha.Instance + currentInstances := make([]v1alpha.Instance, 0, 2) currentInstances = append(currentInstances, *getInstanceForDeployment(deployment, 0)) currentInstances = append(currentInstances, *getInstanceForDeployment(deployment, 1)) @@ -79,7 +81,7 @@ func TestScaleUpWithNotReadyInstance(t *testing.T) { deployment := getWorkloadDeployment("test-deploy", 3) - var currentInstances []v1alpha.Instance + currentInstances := make([]v1alpha.Instance, 0, 2) currentInstances = append(currentInstances, *getInstanceForDeployment(deployment, 0)) notReadyInstance := getInstanceForDeployment(deployment, 1) @@ -109,7 +111,7 @@ func TestScaleUpWithDeletingReadyInstance(t *testing.T) { deployment := getWorkloadDeployment("test-deploy", 3) - var currentInstances []v1alpha.Instance + currentInstances := make([]v1alpha.Instance, 0, 2) currentInstances = append(currentInstances, *getInstanceForDeployment(deployment, 0)) deletingInstance := getInstanceForDeployment(deployment, 1) @@ -136,7 +138,7 @@ func TestScaleDownWithAllReadyInstances(t *testing.T) { 
deployment := getWorkloadDeployment("test-deploy", 1) - var currentInstances []v1alpha.Instance + currentInstances := make([]v1alpha.Instance, 0, 2) currentInstances = append(currentInstances, *getInstanceForDeployment(deployment, 0)) currentInstances = append(currentInstances, *getInstanceForDeployment(deployment, 1)) @@ -150,16 +152,407 @@ func TestScaleDownWithAllReadyInstances(t *testing.T) { assert.False(t, actions[0].IsSkipped()) } +// TestNetworkingEnabledAddsNetworkGate verifies that when networking is enabled +// (the default), newly created Instances receive both the Network and Quota +// scheduling gates so that they are held until the network is provisioned. +func TestNetworkingEnabledAddsNetworkGate(t *testing.T) { + ctx := context.Background() + control := NewWithOptions(Options{NetworkingEnabled: true}) + + deployment := getWorkloadDeployment("test-deploy-net-on", 1) + + var currentInstances []v1alpha.Instance + actions, err := control.GetActions(ctx, scheme, deployment, currentInstances) + + assert.NoError(t, err) + assert.Len(t, actions, 1) + assert.Equal(t, instancecontrol.ActionTypeCreate, actions[0].ActionType()) + + instance, ok := actions[0].Object.(*v1alpha.Instance) + assert.True(t, ok) + assert.NotNil(t, instance.Spec.Controller) + + gateNames := make([]string, 0, len(instance.Spec.Controller.SchedulingGates)) + for _, g := range instance.Spec.Controller.SchedulingGates { + gateNames = append(gateNames, g.Name) + } + assert.Contains(t, gateNames, instancecontrol.NetworkSchedulingGate.String(), + "Network gate must be present when networking is enabled") + assert.Contains(t, gateNames, instancecontrol.QuotaSchedulingGate.String(), + "Quota gate must be present") +} + +// TestNetworkingDisabledOmitsNetworkGate verifies that when networking is +// disabled, newly created Instances do NOT receive the Network scheduling gate, +// so they are not blocked on network provisioning. 
The Quota gate is still +// added so quota enforcement remains active. +func TestNetworkingDisabledOmitsNetworkGate(t *testing.T) { + ctx := context.Background() + control := NewWithOptions(Options{NetworkingEnabled: false}) + + deployment := getWorkloadDeployment("test-deploy-net-off", 1) + + var currentInstances []v1alpha.Instance + actions, err := control.GetActions(ctx, scheme, deployment, currentInstances) + + assert.NoError(t, err) + assert.Len(t, actions, 1) + assert.Equal(t, instancecontrol.ActionTypeCreate, actions[0].ActionType()) + + instance, ok := actions[0].Object.(*v1alpha.Instance) + assert.True(t, ok) + assert.NotNil(t, instance.Spec.Controller) + + gateNames := make([]string, 0, len(instance.Spec.Controller.SchedulingGates)) + for _, g := range instance.Spec.Controller.SchedulingGates { + gateNames = append(gateNames, g.Name) + } + assert.NotContains(t, gateNames, instancecontrol.NetworkSchedulingGate.String(), + "Network gate must NOT be present when networking is disabled") + assert.Contains(t, gateNames, instancecontrol.QuotaSchedulingGate.String(), + "Quota gate must still be present when networking is disabled") +} + // Add more test functions below for different scenarios. +// TestInstanceLabels_FourNewLabelsStamped verifies that all four new +// self-describing labels are stamped on newly created Instances, with values +// sourced from the WorkloadDeployment spec. 
+func TestInstanceLabels_FourNewLabelsStamped(t *testing.T) {
+	deployment := getWorkloadDeployment("test-labels-deploy", 1)
+
+	// nil: no existing instances are supplied.
+	actions, err := New().GetActions(context.Background(), scheme, deployment, nil)
+	assert.NoError(t, err)
+	// assert failures are non-fatal, so return before indexing actions.
+	if !assert.Len(t, actions, 1) {
+		return
+	}
+	assert.Equal(t, instancecontrol.ActionTypeCreate, actions[0].ActionType())
+
+	instance, ok := actions[0].Object.(*v1alpha.Instance)
+	if !assert.True(t, ok) {
+		return
+	}
+	assert.Equal(t, deployment.GetName(), instance.Labels[v1alpha.WorkloadDeploymentNameLabel],
+		"WorkloadDeploymentNameLabel must equal deployment name")
+	assert.Equal(t, deployment.Spec.CityCode, instance.Labels[v1alpha.CityCodeLabel],
+		"CityCodeLabel must equal deployment.Spec.CityCode")
+	assert.Equal(t, deployment.Spec.WorkloadRef.Name, instance.Labels[v1alpha.WorkloadNameLabel],
+		"WorkloadNameLabel must equal deployment.Spec.WorkloadRef.Name")
+	assert.Equal(t, deployment.Spec.PlacementName, instance.Labels[v1alpha.PlacementNameLabel],
+		"PlacementNameLabel must equal deployment.Spec.PlacementName")
+}
+
+// TestInstanceLabels_PropagatedOnUpdate verifies that when an existing instance
+// is updated (rolling update path), the four new labels are refreshed from the
+// deployment so they remain accurate after spec changes.
+func TestInstanceLabels_PropagatedOnUpdate(t *testing.T) {
+	deployment := getWorkloadDeployment("test-labels-update", 1)
+
+	// Existing ready instance. NOTE(review): the helper already stamps these
+	// labels, so this may pass without a real refresh; consider clearing them.
+	currentInstances := []v1alpha.Instance{*getInstanceForDeployment(deployment, 0)}
+
+	// Trigger a rolling update by changing the image.
+	deployment.Spec.Template.Spec.Runtime.Sandbox.Containers[0].Image = "updated-image"
+
+	actions, err := New().GetActions(context.Background(), scheme, deployment, currentInstances)
+	assert.NoError(t, err)
+	// assert failures are non-fatal, so return before indexing actions.
+	if !assert.Len(t, actions, 1) {
+		return
+	}
+	assert.Equal(t, instancecontrol.ActionTypeUpdate, actions[0].ActionType())
+	instance, ok := actions[0].Object.(*v1alpha.Instance)
+	if !assert.True(t, ok) {
+		return
+	}
+	assert.Equal(t, deployment.GetName(), instance.Labels[v1alpha.WorkloadDeploymentNameLabel],
+		"WorkloadDeploymentNameLabel must be refreshed on update")
+	assert.Equal(t, deployment.Spec.CityCode, instance.Labels[v1alpha.CityCodeLabel],
+		"CityCodeLabel must be refreshed on update")
+	assert.Equal(t, deployment.Spec.WorkloadRef.Name, instance.Labels[v1alpha.WorkloadNameLabel],
+		"WorkloadNameLabel must be refreshed on update")
+	assert.Equal(t, deployment.Spec.PlacementName, instance.Labels[v1alpha.PlacementNameLabel],
+		"PlacementNameLabel must be refreshed on update")
+}
+
+// TestInstanceLocation_SetWhenDeploymentStatusLocationPresent verifies that when
+// deployment.Status.Location is set, the new Instance receives it as Spec.Location.
+func TestInstanceLocation_SetWhenDeploymentStatusLocationPresent(t *testing.T) {
+	deployment := getWorkloadDeployment("test-location-set", 1)
+	deployment.Status.Location = &networkingv1alpha.LocationReference{
+		Name:      "loc-dfw-1",
+		Namespace: "networking-system",
+	}
+
+	// nil: no existing instances are supplied.
+	actions, err := New().GetActions(context.Background(), scheme, deployment, nil)
+	assert.NoError(t, err)
+	// assert failures are non-fatal, so return before indexing actions or
+	// dereferencing Spec.Location; otherwise a failure becomes a panic.
+	if !assert.Len(t, actions, 1) {
+		return
+	}
+	instance, ok := actions[0].Object.(*v1alpha.Instance)
+	if !assert.True(t, ok) || !assert.NotNil(t, instance.Spec.Location,
+		"Spec.Location must be set when deployment.Status.Location is non-nil") {
+		return
+	}
+	assert.Equal(t, "loc-dfw-1", instance.Spec.Location.Name)
+	assert.Equal(t, "networking-system", instance.Spec.Location.Namespace)
+}
+
+// TestInstanceLocation_NilWhenDeploymentStatusLocationAbsent verifies that when
+// deployment.Status.Location is nil (no Location object matches the city code),
+// instance creation still succeeds and Spec.Location remains nil — no regression
+// on the "create instances regardless of Location" contract.
+func TestInstanceLocation_NilWhenDeploymentStatusLocationAbsent(t *testing.T) { + ctx := context.Background() + control := New() + + deployment := getWorkloadDeployment("test-location-nil", 1) + // deployment.Status.Location is intentionally not set (nil) + + var currentInstances []v1alpha.Instance + actions, err := control.GetActions(ctx, scheme, deployment, currentInstances) + + assert.NoError(t, err, "instance creation must succeed even when Status.Location is nil") + assert.Len(t, actions, 1, "exactly one create action must be produced") + + instance, ok := actions[0].Object.(*v1alpha.Instance) + assert.True(t, ok) + assert.Nil(t, instance.Spec.Location, + "Spec.Location must remain nil when deployment.Status.Location is not set") + assert.Equal(t, instancecontrol.ActionTypeCreate, actions[0].ActionType(), + "action must be a Create, proving instance creation is not gated on Location") +} + +// TestLabelBackfill_NotReadyMatchingHash verifies that a not-Ready instance +// with an unchanged template hash receives a PatchLabels action when it is +// missing controller-managed labels. The action must not be a rollout Update, +// must not alter spec/template, and must not block subsequent instances. +func TestLabelBackfill_NotReadyMatchingHash(t *testing.T) { + ctx := context.Background() + control := New() + + deployment := getWorkloadDeployment("test-backfill-notready", 2) + + // Instance 0: not-Ready, correct template hash, but missing city-code/workload-name labels. + instance0 := getInstanceForDeployment(deployment, 0) + apimeta.SetStatusCondition(&instance0.Status.Conditions, metav1.Condition{ + Type: v1alpha.InstanceReady, + Status: metav1.ConditionFalse, + Reason: "NotReady", + Message: "Instance is not ready", + LastTransitionTime: metav1.Now(), + }) + // Simulate pre-existing instance that only has the index label (missing the newer labels). 
+ instance0.Labels = map[string]string{ + v1alpha.InstanceIndexLabel: "0", + } + + // Instance 1: needs to be created (nil in desiredInstances), so we only provide instance0. + currentInstances := []v1alpha.Instance{*instance0} + + actions, err := control.GetActions(ctx, scheme, deployment, currentInstances) + + assert.NoError(t, err) + + // Collect actions by type. + var waitActions, createActions, updateActions, patchActions []instancecontrol.Action + for _, a := range actions { + switch a.ActionType() { + case instancecontrol.ActionTypeWait: + waitActions = append(waitActions, a) + case instancecontrol.ActionTypeCreate: + createActions = append(createActions, a) + case instancecontrol.ActionTypeUpdate: + updateActions = append(updateActions, a) + case instancecontrol.ActionTypePatchLabels: + patchActions = append(patchActions, a) + } + } + + // The not-Ready instance must still produce a Wait (rollout is gated). + assert.Len(t, waitActions, 1, "not-Ready instance must still produce a Wait action") + assert.Equal(t, "test-backfill-notready-0", waitActions[0].Object.GetName()) + + // The missing instance-1 create is skipped (ordered policy, Wait is first). + assert.Len(t, createActions, 1, "instance-1 create action must be present") + assert.True(t, createActions[0].IsSkipped(), "create for instance-1 must be skipped while instance-0 is waiting") + + // No template Update actions must be produced. + assert.Empty(t, updateActions, "no template Update must be produced for a matching-hash instance") + + // A PatchLabels action must be produced for instance-0. + assert.Len(t, patchActions, 1, "exactly one PatchLabels action for the label-drifted instance") + assert.Equal(t, "test-backfill-notready-0", patchActions[0].Object.GetName()) + assert.False(t, patchActions[0].IsSkipped(), "PatchLabels must not be skipped by the rollout skip-loop") + + // The patched object must carry all desired labels. 
+ patched, ok := patchActions[0].Object.(*v1alpha.Instance) + assert.True(t, ok) + assert.Equal(t, deployment.GetName(), patched.Labels[v1alpha.WorkloadDeploymentNameLabel]) + assert.Equal(t, deployment.Spec.CityCode, patched.Labels[v1alpha.CityCodeLabel]) + assert.Equal(t, deployment.Spec.WorkloadRef.Name, patched.Labels[v1alpha.WorkloadNameLabel]) + assert.Equal(t, deployment.Spec.PlacementName, patched.Labels[v1alpha.PlacementNameLabel]) + + // The patched object's spec and template-hash must be unchanged. + assert.Equal(t, instancecontrol.ComputeHash(deployment.Spec.Template), patched.Spec.Controller.TemplateHash, + "template hash must be unchanged by the label backfill") + assert.Equal(t, deployment.Spec.Template.Spec.Runtime, patched.Spec.Runtime, + "spec must be unchanged by the label backfill") +} + +// TestLabelBackfill_Idempotent verifies that an instance already carrying all +// correct controller-managed labels produces no PatchLabels action. +func TestLabelBackfill_Idempotent(t *testing.T) { + ctx := context.Background() + control := New() + + deployment := getWorkloadDeployment("test-backfill-idempotent", 1) + + // Instance already has all controller-managed labels set correctly. 
+ instance := getInstanceForDeployment(deployment, 0) + instance.Labels = map[string]string{ + v1alpha.InstanceIndexLabel: "0", + v1alpha.WorkloadUIDLabel: string(deployment.Spec.WorkloadRef.UID), + v1alpha.WorkloadDeploymentUIDLabel: string(deployment.GetUID()), + v1alpha.WorkloadDeploymentNameLabel: deployment.GetName(), + v1alpha.CityCodeLabel: deployment.Spec.CityCode, + v1alpha.WorkloadNameLabel: deployment.Spec.WorkloadRef.Name, + v1alpha.PlacementNameLabel: deployment.Spec.PlacementName, + } + + currentInstances := []v1alpha.Instance{*instance} + actions, err := control.GetActions(ctx, scheme, deployment, currentInstances) + + assert.NoError(t, err) + + for _, a := range actions { + assert.NotEqual(t, instancecontrol.ActionTypePatchLabels, a.ActionType(), + "no PatchLabels action must be produced when all labels are already correct") + } +} + +// TestLabelBackfill_ReadyInstanceCorrected verifies that a Ready instance with +// correct template hash but drifted labels receives a PatchLabels action +// without triggering a template rollout Update. +func TestLabelBackfill_ReadyInstanceCorrected(t *testing.T) { + ctx := context.Background() + control := New() + + deployment := getWorkloadDeployment("test-backfill-ready", 1) + + // Ready instance with matching hash but missing city-code label. + instance := getInstanceForDeployment(deployment, 0) + // Remove the city-code label to simulate drift. + delete(instance.Labels, v1alpha.CityCodeLabel) + + currentInstances := []v1alpha.Instance{*instance} + actions, err := control.GetActions(ctx, scheme, deployment, currentInstances) + + assert.NoError(t, err) + + var updateActions, patchActions []instancecontrol.Action + for _, a := range actions { + switch a.ActionType() { + case instancecontrol.ActionTypeUpdate: + updateActions = append(updateActions, a) + case instancecontrol.ActionTypePatchLabels: + patchActions = append(patchActions, a) + } + } + + // No template Update must be produced — template hash matches. 
+ assert.Empty(t, updateActions, "no template Update must be produced for a matching-hash ready instance") + + // A PatchLabels action must be produced. + assert.Len(t, patchActions, 1, "PatchLabels action must be produced for the label-drifted ready instance") + patched, ok := patchActions[0].Object.(*v1alpha.Instance) + assert.True(t, ok) + assert.Equal(t, deployment.Spec.CityCode, patched.Labels[v1alpha.CityCodeLabel], + "city-code label must be corrected by the backfill") +} + +// TestLabelBackfill_DoesNotAffectRollingUpdate verifies that a genuine template +// change on a Ready instance still produces a normal ordered Update action and +// that the PatchLabels path does not interfere with or duplicate it. +func TestLabelBackfill_DoesNotAffectRollingUpdate(t *testing.T) { + ctx := context.Background() + control := New() + + deployment := getWorkloadDeployment("test-backfill-rolling", 2) + + // Two ready instances with all correct labels and matching current hash. + instance0 := getInstanceForDeployment(deployment, 0) + instance0.Labels = map[string]string{ + v1alpha.InstanceIndexLabel: "0", + v1alpha.WorkloadUIDLabel: string(deployment.Spec.WorkloadRef.UID), + v1alpha.WorkloadDeploymentUIDLabel: string(deployment.GetUID()), + v1alpha.WorkloadDeploymentNameLabel: deployment.GetName(), + v1alpha.CityCodeLabel: deployment.Spec.CityCode, + v1alpha.WorkloadNameLabel: deployment.Spec.WorkloadRef.Name, + v1alpha.PlacementNameLabel: deployment.Spec.PlacementName, + } + instance1 := getInstanceForDeployment(deployment, 1) + instance1.Labels = map[string]string{ + v1alpha.InstanceIndexLabel: "1", + v1alpha.WorkloadUIDLabel: string(deployment.Spec.WorkloadRef.UID), + v1alpha.WorkloadDeploymentUIDLabel: string(deployment.GetUID()), + v1alpha.WorkloadDeploymentNameLabel: deployment.GetName(), + v1alpha.CityCodeLabel: deployment.Spec.CityCode, + v1alpha.WorkloadNameLabel: deployment.Spec.WorkloadRef.Name, + v1alpha.PlacementNameLabel: deployment.Spec.PlacementName, + } + + 
// Trigger a template change. + deployment.Spec.Template.Spec.Runtime.Sandbox.Containers[0].Image = "rolling-update-image" + + currentInstances := []v1alpha.Instance{*instance0, *instance1} + actions, err := control.GetActions(ctx, scheme, deployment, currentInstances) + + assert.NoError(t, err) + + var updateActions, patchActions []instancecontrol.Action + for _, a := range actions { + switch a.ActionType() { + case instancecontrol.ActionTypeUpdate: + updateActions = append(updateActions, a) + case instancecontrol.ActionTypePatchLabels: + patchActions = append(patchActions, a) + } + } + + // Two Update actions expected (one per instance), ordered highest-to-lowest. + assert.Len(t, updateActions, 2, "both instances must produce Update actions on template change") + assert.Equal(t, "test-backfill-rolling-1", updateActions[0].Object.GetName(), + "Update actions must be ordered highest ordinal first") + assert.Equal(t, "test-backfill-rolling-0", updateActions[1].Object.GetName()) + assert.False(t, updateActions[0].IsSkipped(), "first Update must be active") + assert.True(t, updateActions[1].IsSkipped(), "second Update must be skipped (ordered rollout)") + + // No PatchLabels — all labels are already correct. 
+ assert.Empty(t, patchActions, "no PatchLabels when all labels are already correct") +} + func getWorkloadDeployment(name string, minReplicas int32) *v1alpha.WorkloadDeployment { instance := getInstanceTemplate(name, 0) deployment := &v1alpha.WorkloadDeployment{ ObjectMeta: metav1.ObjectMeta{ Name: name, Namespace: "default", + UID: "test-wd-uid", }, Spec: v1alpha.WorkloadDeploymentSpec{ + WorkloadRef: v1alpha.WorkloadReference{ + Name: "test-workload", + UID: "test-workload-uid", + }, + PlacementName: "test-placement", + CityCode: "DFW", ScaleSettings: v1alpha.HorizontalScaleSettings{ MinReplicas: minReplicas, InstanceManagementPolicy: v1alpha.OrderedReadyInstanceManagementPolicyType, @@ -180,6 +573,20 @@ func getInstanceForDeployment(deployment *v1alpha.WorkloadDeployment, ordinal in TemplateHash: instancecontrol.ComputeHash(deployment.Spec.Template), } + // Stamp all controller-managed labels so that the label-backfill path is a + // no-op for instances built by this helper. Tests that specifically exercise + // label drift should manipulate the labels directly after calling this helper. 
+ if instance.Labels == nil { + instance.Labels = map[string]string{} + } + instance.Labels[v1alpha.InstanceIndexLabel] = strconv.Itoa(ordinal) + instance.Labels[v1alpha.WorkloadUIDLabel] = string(deployment.Spec.WorkloadRef.UID) + instance.Labels[v1alpha.WorkloadDeploymentUIDLabel] = string(deployment.GetUID()) + instance.Labels[v1alpha.WorkloadDeploymentNameLabel] = deployment.GetName() + instance.Labels[v1alpha.CityCodeLabel] = deployment.Spec.CityCode + instance.Labels[v1alpha.WorkloadNameLabel] = deployment.Spec.WorkloadRef.Name + instance.Labels[v1alpha.PlacementNameLabel] = deployment.Spec.PlacementName + return instance } diff --git a/internal/controller/testing_helpers_test.go b/internal/controller/testing_helpers_test.go new file mode 100644 index 00000000..cc3d3d9f --- /dev/null +++ b/internal/controller/testing_helpers_test.go @@ -0,0 +1,101 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package controller + +import ( + "context" + "fmt" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/controller-runtime/pkg/cluster" + mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager" + "sigs.k8s.io/multicluster-runtime/pkg/multicluster" + + karmadapolicyv1alpha1 "github.com/karmada-io/api/policy/v1alpha1" + computev1alpha "go.datum.net/compute/api/v1alpha" +) + +// ─── Scheme helpers ─────────────────────────────────────────────────────────── + +// newProjectScheme builds a runtime.Scheme with the types needed by the project +// cluster (corev1 + compute). +func newProjectScheme() *runtime.Scheme { + s := runtime.NewScheme() + _ = corev1.AddToScheme(s) + _ = computev1alpha.AddToScheme(s) + return s +} + +// newKarmadaScheme builds a runtime.Scheme with the types needed by the Karmada +// API server (corev1 + compute + karmada policy). 
+func newKarmadaScheme() *runtime.Scheme { + s := runtime.NewScheme() + _ = corev1.AddToScheme(s) + _ = computev1alpha.AddToScheme(s) + _ = karmadapolicyv1alpha1.Install(s) + return s +} + +// newProjectFakeClient returns a fake client pre-populated with the given +// objects and the project scheme. +func newProjectFakeClient(objs ...client.Object) client.Client { + return fake.NewClientBuilder(). + WithScheme(newProjectScheme()). + WithObjects(objs...). + WithStatusSubresource(objs...). + Build() +} + +// newKarmadaFakeClient returns a fake client pre-populated with the given +// objects and the Karmada scheme. +func newKarmadaFakeClient(objs ...client.Object) client.Client { + return fake.NewClientBuilder(). + WithScheme(newKarmadaScheme()). + WithObjects(objs...). + Build() +} + +// ─── Fake cluster.Cluster ───────────────────────────────────────────────────── + +// fakeCluster is a minimal cluster.Cluster implementation for tests. +// Embeds the interface so only the methods we need are implemented. +type fakeCluster struct { + cluster.Cluster // nil embed — panics if unimplemented methods are called + cl client.Client +} + +func (f *fakeCluster) GetClient() client.Client { return f.cl } +func (f *fakeCluster) GetScheme() *runtime.Scheme { return f.cl.Scheme() } +func (f *fakeCluster) GetAPIReader() client.Reader { return f.cl } + +// newFakeCluster wraps a fake client in a fakeCluster. +func newFakeCluster(cl client.Client) *fakeCluster { + return &fakeCluster{cl: cl} +} + +// ─── Fake mcmanager.Manager ─────────────────────────────────────────────────── + +// fakeMCManager is a minimal mcmanager.Manager implementation that serves a +// fixed map of project clusters. Only GetCluster is implemented; all other +// Manager methods panic through the embedded nil interface. 
+type fakeMCManager struct { + mcmanager.Manager // nil embed — panics if unimplemented methods are called + clusters map[string]cluster.Cluster +} + +func (m *fakeMCManager) GetCluster(_ context.Context, name multicluster.ClusterName) (cluster.Cluster, error) { + if c, ok := m.clusters[string(name)]; ok { + return c, nil + } + return nil, fmt.Errorf("cluster %q not found in fake manager", name) +} + +// newFakeMCManager returns a fakeMCManager with a single named cluster. +func newFakeMCManager(clusterName string, cl cluster.Cluster) *fakeMCManager { + return &fakeMCManager{ + clusters: map[string]cluster.Cluster{clusterName: cl}, + } +} diff --git a/internal/controller/workload_controller.go b/internal/controller/workload_controller.go index 6e907b65..cad13c00 100644 --- a/internal/controller/workload_controller.go +++ b/internal/controller/workload_controller.go @@ -26,13 +26,20 @@ import ( mcbuilder "sigs.k8s.io/multicluster-runtime/pkg/builder" mccontext "sigs.k8s.io/multicluster-runtime/pkg/context" mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager" + "sigs.k8s.io/multicluster-runtime/pkg/multicluster" mcreconcile "sigs.k8s.io/multicluster-runtime/pkg/reconcile" computev1alpha "go.datum.net/compute/api/v1alpha" networkingv1alpha "go.datum.net/network-services-operator/api/v1alpha" ) -const workloadControllerFinalizer = "compute.datumapis.com/workload-controller" +const ( + workloadControllerFinalizer = "compute.datumapis.com/workload-controller" + workloadConditionTypeAvailable = "Available" +) + +// conditionAvailable is the condition type used to indicate resource availability. 
+const conditionAvailable = "Available" // WorkloadReconciler reconciles a Workload object type WorkloadReconciler struct { @@ -118,7 +125,7 @@ func (r *WorkloadReconciler) Reconcile(ctx context.Context, req mcreconcile.Requ if len(notFoundNetworks) > 0 { missingNetworks := strings.Join(notFoundNetworks.UnsortedList(), ", ") changed := apimeta.SetStatusCondition(&workload.Status.Conditions, metav1.Condition{ - Type: "Available", + Type: workloadConditionTypeAvailable, Status: metav1.ConditionFalse, Reason: "NetworkNotFound", Message: fmt.Sprintf("Unable to find networks: %s", missingNetworks), @@ -238,7 +245,7 @@ func (r *WorkloadReconciler) reconcileWorkloadStatus( } placementAvailableCondition := metav1.Condition{ - Type: "Available", + Type: conditionAvailable, Status: metav1.ConditionFalse, Reason: "NoAvailableDeployments", Message: "No available deployments were found for the placement", @@ -256,7 +263,7 @@ func (r *WorkloadReconciler) reconcileWorkloadStatus( desiredReplicas += deployment.Status.DesiredReplicas readyReplicas += deployment.Status.ReadyReplicas - if apimeta.IsStatusConditionTrue(deployment.Status.Conditions, "Available") { + if apimeta.IsStatusConditionTrue(deployment.Status.Conditions, conditionAvailable) { foundAvailableDeployment = true } } @@ -283,7 +290,7 @@ func (r *WorkloadReconciler) reconcileWorkloadStatus( } availableCondition := metav1.Condition{ - Type: "Available", + Type: conditionAvailable, Status: metav1.ConditionFalse, Reason: "NoAvailablePlacements", Message: "No available placements were found for the workload", @@ -383,9 +390,9 @@ func (r *WorkloadReconciler) getDeploymentsForWorkload( existingDeployments.Insert(deployment.Name) } - var locations networkingv1alpha.LocationList + var locations networkingv1alpha.LocationBindingList if err := upstreamClient.List(ctx, &locations); err != nil { - return nil, nil, fmt.Errorf("failed to list locations: %w", err) + return nil, nil, fmt.Errorf("failed to list location bindings: %w", 
err) } if len(locations.Items) == 0 { @@ -463,7 +470,7 @@ func (r *WorkloadReconciler) SetupWithManager(mgr mcmanager.Manager) error { return mcbuilder.ControllerManagedBy(mgr). For(&computev1alpha.Workload{}, mcbuilder.WithEngageWithLocalCluster(false)). Owns(&computev1alpha.WorkloadDeployment{}, mcbuilder.WithEngageWithLocalCluster(false)). - Watches(&networkingv1alpha.Network{}, func(clusterName string, cl cluster.Cluster) handler.TypedEventHandler[client.Object, mcreconcile.Request] { + Watches(&networkingv1alpha.Network{}, func(clusterName multicluster.ClusterName, cl cluster.Cluster) handler.TypedEventHandler[client.Object, mcreconcile.Request] { return handler.TypedEnqueueRequestsFromMapFunc(func(ctx context.Context, network client.Object) []mcreconcile.Request { logger := log.FromContext(ctx) diff --git a/internal/controller/workloaddeployment_controller.go b/internal/controller/workloaddeployment_controller.go index 50e21ef0..9b17266e 100644 --- a/internal/controller/workloaddeployment_controller.go +++ b/internal/controller/workloaddeployment_controller.go @@ -24,6 +24,7 @@ import ( mcbuilder "sigs.k8s.io/multicluster-runtime/pkg/builder" mccontext "sigs.k8s.io/multicluster-runtime/pkg/context" mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager" + "sigs.k8s.io/multicluster-runtime/pkg/multicluster" mcreconcile "sigs.k8s.io/multicluster-runtime/pkg/reconcile" computev1alpha "go.datum.net/compute/api/v1alpha" @@ -37,11 +38,28 @@ import ( type WorkloadDeploymentReconciler struct { mgr mcmanager.Manager finalizers finalizer.Finalizers + // KarmadaClient is an optional client pointing at the Karmada control plane. + // When non-nil, the reconciler writes the WorkloadDeployment status back to + // the Karmada namespace after each reconcile so the WorkloadDeploymentFederator + // can aggregate it into the project-namespace object. Set to nil to disable. 
+ KarmadaClient client.Client + + // NetworkingEnabled controls whether the networking integration with + // network-services-operator is active. When false, NetworkBinding creation is + // skipped, the Network scheduling gate is never added to Instances (and is + // actively removed if present), and the networking step is treated as + // immediately ready. Defaults to true. + NetworkingEnabled bool } // +kubebuilder:rbac:groups=compute.datumapis.com,resources=workloaddeployments,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=compute.datumapis.com,resources=workloaddeployments/status,verbs=get;update;patch // +kubebuilder:rbac:groups=compute.datumapis.com,resources=workloaddeployments/finalizers,verbs=update +// +kubebuilder:rbac:groups=networking.datumapis.com,resources=locations,verbs=get;list;watch +// +kubebuilder:rbac:groups=networking.datumapis.com,resources=networkbindings,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups=networking.datumapis.com,resources=networkcontexts,verbs=get;list;watch +// +kubebuilder:rbac:groups=networking.datumapis.com,resources=subnetclaims,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups=networking.datumapis.com,resources=subnets,verbs=get;list;watch func (r *WorkloadDeploymentReconciler) Reconcile(ctx context.Context, req mcreconcile.Request) (ctrl.Result, error) { logger := log.FromContext(ctx) @@ -86,10 +104,6 @@ func (r *WorkloadDeploymentReconciler) Reconcile(ctx context.Context, req mcreco logger.Info("reconciling deployment") defer logger.Info("reconcile complete") - if deployment.Status.Location == nil { - return ctrl.Result{}, nil - } - // Collect all instances for this deployment listOpts := client.MatchingLabels{ computev1alpha.WorkloadDeploymentUIDLabel: string(deployment.GetUID()), @@ -100,7 +114,9 @@ func (r *WorkloadDeploymentReconciler) Reconcile(ctx context.Context, req mcreco return ctrl.Result{}, fmt.Errorf("failed listing 
instances: %w", err) } - instanceControl := instancecontrolstateful.New() + instanceControl := instancecontrolstateful.NewWithOptions(instancecontrolstateful.Options{ + NetworkingEnabled: r.NetworkingEnabled, + }) actions, err := instanceControl.GetActions(ctx, cl.GetScheme(), &deployment, instances.Items) if err != nil { @@ -122,9 +138,26 @@ func (r *WorkloadDeploymentReconciler) Reconcile(ctx context.Context, req mcreco } } - networkReady, err := r.reconcileNetworks(ctx, cl.GetClient(), &deployment) - if err != nil { - return ctrl.Result{}, fmt.Errorf("failed reconciling networks: %w", err) + // When networking is disabled, bypass the entire network provisioning path. + // The Network scheduling gate is treated as cleared and no NetworkBindings + // are created. This lets Instances reach the runtime on cells where + // network-services-operator (VPC) is not yet available. + var networkReady bool + if !r.NetworkingEnabled { + networkReady = true + } else { + var resolvedLocation *networkingv1alpha.LocationReference + networkReady, resolvedLocation, err = r.reconcileNetworks(ctx, cl.GetClient(), &deployment) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed reconciling networks: %w", err) + } + // Persist the resolved Location to status so downstream components (e.g. + // the stateful instance control strategy) can propagate it to Instances. + // When no matching Location exists, resolvedLocation is nil and + // Status.Location remains nil — instance creation is not blocked. 
+ if resolvedLocation != nil { + deployment.Status.Location = resolvedLocation + } } // Networks are all ready with subnets ready to use, remove any scheduling @@ -143,59 +176,59 @@ func (r *WorkloadDeploymentReconciler) Reconcile(ctx context.Context, req mcreco return ctrl.Result{}, err } - patchResult, err := controllerutil.CreateOrPatch(ctx, cl.GetClient(), &deployment, func() error { - deployment.Status.Replicas = int32(replicas) - deployment.Status.CurrentReplicas = int32(currentReplicas) - deployment.Status.DesiredReplicas = desiredReplicas - deployment.Status.ReadyReplicas = int32(readyReplicas) - - if quotaBlockedReplicas > 0 { - apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ - Type: computev1alpha.WorkloadDeploymentReplicasReady, - Status: metav1.ConditionFalse, - Reason: computev1alpha.InstanceQuotaGrantedReasonQuotaExceeded, - Message: fmt.Sprintf("%d of %d desired replicas are pending quota", quotaBlockedReplicas, desiredReplicas), - }) - } else { - apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ - Type: computev1alpha.WorkloadDeploymentReplicasReady, - Status: metav1.ConditionTrue, - Reason: "ReplicasAvailable", - Message: fmt.Sprintf("%d/%d replicas available", readyReplicas, desiredReplicas), - }) - } - - if readyReplicas > 0 { - apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ - Type: computev1alpha.WorkloadDeploymentAvailable, - Status: metav1.ConditionTrue, - Reason: "StableInstanceFound", - Message: fmt.Sprintf("%d/%d instances are ready", readyReplicas, replicas), - }) - } else if !networkReady { - apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ - Type: computev1alpha.WorkloadDeploymentAvailable, - Status: metav1.ConditionFalse, - Reason: "ProvisioningNetwork", - Message: "Network is being provisioned", - }) - } else if replicas > 0 { - apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ - Type: 
computev1alpha.WorkloadDeploymentAvailable, - Status: metav1.ConditionFalse, - Reason: "ProvisioningInstances", - Message: "Instances are being provisioned", - }) - } + deployment.Status.Replicas = int32(replicas) + deployment.Status.CurrentReplicas = int32(currentReplicas) + deployment.Status.DesiredReplicas = desiredReplicas + deployment.Status.ReadyReplicas = int32(readyReplicas) + + if quotaBlockedReplicas > 0 { + apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ + Type: computev1alpha.WorkloadDeploymentReplicasReady, + Status: metav1.ConditionFalse, + Reason: computev1alpha.InstanceQuotaGrantedReasonQuotaExceeded, + Message: fmt.Sprintf("%d of %d desired replicas are pending quota", quotaBlockedReplicas, desiredReplicas), + }) + } else { + apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ + Type: computev1alpha.WorkloadDeploymentReplicasReady, + Status: metav1.ConditionTrue, + Reason: "ReplicasAvailable", + Message: fmt.Sprintf("%d/%d replicas available", readyReplicas, desiredReplicas), + }) + } - return nil - }) + if readyReplicas > 0 { + apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ + Type: computev1alpha.WorkloadDeploymentAvailable, + Status: metav1.ConditionTrue, + Reason: "StableInstanceFound", + Message: fmt.Sprintf("%d/%d instances are ready", readyReplicas, replicas), + }) + } else if !networkReady { + apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ + Type: computev1alpha.WorkloadDeploymentAvailable, + Status: metav1.ConditionFalse, + Reason: "ProvisioningNetwork", + Message: "Network is being provisioned", + }) + } else if replicas > 0 { + apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ + Type: computev1alpha.WorkloadDeploymentAvailable, + Status: metav1.ConditionFalse, + Reason: "ProvisioningInstances", + Message: "Instances are being provisioned", + }) + } - if err != nil { + if err := 
cl.GetClient().Status().Update(ctx, &deployment); err != nil { return ctrl.Result{}, fmt.Errorf("failed updating deployment status: %w", err) } - logger.Info("deployment status processed", "operation_result", patchResult) + if err := r.writeStatusToKarmada(ctx, &deployment); err != nil { + return ctrl.Result{}, err + } + + logger.Info("deployment status updated") return ctrl.Result{}, nil } @@ -240,13 +273,70 @@ func (r *WorkloadDeploymentReconciler) reconcileInstanceGates( return currentReplicas, readyReplicas, quotaBlockedReplicas, nil } +// writeStatusToKarmada copies the WorkloadDeployment status to the matching +// object in the Karmada namespace so the WorkloadDeploymentFederator can +// sync it back to the project-namespace object on the control plane. +// It is a no-op when KarmadaClient is nil. +func (r *WorkloadDeploymentReconciler) writeStatusToKarmada(ctx context.Context, deployment *computev1alpha.WorkloadDeployment) error { + if r.KarmadaClient == nil { + return nil + } + + var kd computev1alpha.WorkloadDeployment + if err := r.KarmadaClient.Get(ctx, client.ObjectKeyFromObject(deployment), &kd); err != nil { + if apierrors.IsNotFound(err) { + return nil + } + return fmt.Errorf("failed getting Karmada WD for status writeback: %w", err) + } + + kd.Status = deployment.Status + // Use Update (not Patch) so all required status fields are present in the + // request body; MergeFrom omits unchanged zero-value int32 fields which + // would fail the CRD's required constraints on currentReplicas/readyReplicas. + if err := r.KarmadaClient.Status().Update(ctx, &kd); err != nil { + return fmt.Errorf("failed updating Karmada WD status: %w", err) + } + + return nil +} + +// reconcileNetworks ensures NetworkBindings and SubnetClaims exist for all +// network interfaces on the deployment. It returns (networkReady, resolvedLocation, err). +// resolvedLocation is non-nil when a Location matching the deployment's city code +// was found; nil otherwise. 
Instance creation is never gated on resolvedLocation +// being non-nil — callers must treat a nil location as best-effort only. func (r *WorkloadDeploymentReconciler) reconcileNetworks( ctx context.Context, c client.Client, deployment *computev1alpha.WorkloadDeployment, -) (bool, error) { +) (bool, *networkingv1alpha.LocationReference, error) { logger := log.FromContext(ctx) + // Resolve the Location for this deployment's city code. With Karmada + // propagation the WorkloadDeployment lands in the cluster that serves the + // requested city, so the Location object for that city must exist locally. + var locationList networkingv1alpha.LocationList + if err := c.List(ctx, &locationList); err != nil { + return false, nil, fmt.Errorf("failed to list locations: %w", err) + } + + var locationRef *networkingv1alpha.LocationReference + for _, loc := range locationList.Items { + if cityCode, ok := loc.Spec.Topology["topology.datum.net/city-code"]; ok && cityCode == deployment.Spec.CityCode { + locationRef = &networkingv1alpha.LocationReference{ + Name: loc.Name, + Namespace: loc.Namespace, + } + break + } + } + + if locationRef == nil { + logger.Info("no location found for city code, waiting", "cityCode", deployment.Spec.CityCode) + return false, nil, nil + } + // First, ensure we have a NetworkBinding for each interface, and that the // binding is ready before we move on to create SubnetClaims. 
@@ -260,7 +350,7 @@ func (r *WorkloadDeploymentReconciler) reconcileNetworks( } if err := c.Get(ctx, networkBindingObjectKey, &networkBinding); client.IgnoreNotFound(err) != nil { - return false, fmt.Errorf("failed checking for existing network binding: %w", err) + return false, nil, fmt.Errorf("failed checking for existing network binding: %w", err) } if networkBinding.CreationTimestamp.IsZero() { @@ -271,16 +361,16 @@ func (r *WorkloadDeploymentReconciler) reconcileNetworks( }, Spec: networkingv1alpha.NetworkBindingSpec{ Network: networkInterface.Network, - Location: *deployment.Status.Location, + Location: *locationRef, }, } if err := controllerutil.SetControllerReference(deployment, &networkBinding, c.Scheme()); err != nil { - return false, fmt.Errorf("failed to set controller on network binding: %w", err) + return false, nil, fmt.Errorf("failed to set controller on network binding: %w", err) } if err := c.Create(ctx, &networkBinding); err != nil { - return false, fmt.Errorf("failed creating network binding: %w", err) + return false, nil, fmt.Errorf("failed creating network binding: %w", err) } } @@ -293,7 +383,7 @@ func (r *WorkloadDeploymentReconciler) reconcileNetworks( if !allNetworkBindingsReady { logger.Info("waiting for network bindings to be ready") - return false, nil + return false, locationRef, nil } // TODO(jreese): Currently this makes a SubnetClaim that will be used by @@ -312,12 +402,12 @@ func (r *WorkloadDeploymentReconciler) reconcileNetworks( } if err := c.Get(ctx, networkContextObjectKey, &networkContext); client.IgnoreNotFound(err) != nil { - return false, fmt.Errorf("failed checking for existing network context: %w", err) + return false, nil, fmt.Errorf("failed checking for existing network context: %w", err) } if !apimeta.IsStatusConditionTrue(networkContext.Status.Conditions, networkingv1alpha.NetworkContextReady) { logger.Info("waiting for network context to be ready", "network_context", networkContext.Name) - return false, nil + return 
false, locationRef, nil } var subnetClaims networkingv1alpha.SubnetClaimList @@ -326,7 +416,7 @@ func (r *WorkloadDeploymentReconciler) reconcileNetworks( } if err := c.List(ctx, &subnetClaims, listOpts...); err != nil { - return false, fmt.Errorf("failed listing subnet claims: %w", err) + return false, nil, fmt.Errorf("failed listing subnet claims: %w", err) } var subnetClaim networkingv1alpha.SubnetClaim @@ -347,8 +437,8 @@ func (r *WorkloadDeploymentReconciler) reconcileNetworks( } // If it's not the same location, don't consider the subnet claim. - if claim.Spec.Location.Namespace != deployment.Status.Location.Namespace || - claim.Spec.Location.Name != deployment.Status.Location.Name { + if claim.Spec.Location.Namespace != locationRef.Namespace || + claim.Spec.Location.Name != locationRef.Name { continue } @@ -371,28 +461,28 @@ func (r *WorkloadDeploymentReconciler) reconcileNetworks( NetworkContext: networkingv1alpha.LocalNetworkContextRef{ Name: networkContext.Name, }, - Location: *deployment.Status.Location, + Location: *locationRef, }, } if err := controllerutil.SetOwnerReference(&networkContext, &subnetClaim, c.Scheme()); err != nil { - return false, fmt.Errorf("failed to set controller on subnet claim: %w", err) + return false, nil, fmt.Errorf("failed to set controller on subnet claim: %w", err) } if err := c.Create(ctx, &subnetClaim); err != nil { - return false, fmt.Errorf("failed creating subnet claim: %w", err) + return false, nil, fmt.Errorf("failed creating subnet claim: %w", err) } logger.Info("created subnet claim", "subnetClaim", subnetClaim.Name) - return false, nil + return false, locationRef, nil } logger.Info("found subnet claim", "subnetClaim", subnetClaim.Name) if !apimeta.IsStatusConditionTrue(subnetClaim.Status.Conditions, "Ready") { logger.Info("waiting for subnet claim to be ready", "subnetClaim", subnetClaim.Name) - return false, nil + return false, locationRef, nil } var subnet networkingv1alpha.Subnet @@ -401,19 +491,19 @@ func (r 
*WorkloadDeploymentReconciler) reconcileNetworks( Name: subnetClaim.Status.SubnetRef.Name, } if err := c.Get(ctx, subnetObjectKey, &subnet); err != nil { - return false, fmt.Errorf("failed fetching subnet: %w", err) + return false, nil, fmt.Errorf("failed fetching subnet: %w", err) } if !apimeta.IsStatusConditionTrue(subnet.Status.Conditions, "Ready") { logger.Info("waiting for subnet to be ready", "subnet", subnet.Name) - return false, nil + return false, locationRef, nil } logger.Info("subnet is ready", "subnet", subnet.Name) } - return true, nil + return true, locationRef, nil } var errDeploymentHasInstances = errors.New("deployment has instances") @@ -468,47 +558,65 @@ func (r *WorkloadDeploymentReconciler) SetupWithManager(mgr mcmanager.Manager) e if err := r.finalizers.Register(workloadControllerFinalizer, r); err != nil { return fmt.Errorf("failed to register finalizer: %w", err) } - return mcbuilder.ControllerManagedBy(mgr). + + b := mcbuilder.ControllerManagedBy(mgr). For(&computev1alpha.WorkloadDeployment{}, mcbuilder.WithEngageWithLocalCluster(false)). - Owns(&computev1alpha.Instance{}). - Owns(&networkingv1alpha.NetworkBinding{}). - Watches(&networkingv1alpha.SubnetClaim{}, func(clusterName string, cl cluster.Cluster) handler.TypedEventHandler[client.Object, mcreconcile.Request] { - return handler.TypedEnqueueRequestsFromMapFunc(func(ctx context.Context, o client.Object) []mcreconcile.Request { - subnetClaim := o.(*networkingv1alpha.SubnetClaim) - return enqueueWorkloadDeploymentByLocation(ctx, mgr, clusterName, subnetClaim.Spec.Location) + Owns(&computev1alpha.Instance{}) + + // Only watch networking resources when the networking integration is enabled. + // On cells without network-services-operator these watches would log spurious + // errors for missing CRDs. + if r.NetworkingEnabled { + b = b. + Owns(&networkingv1alpha.NetworkBinding{}). 
+ Watches(&networkingv1alpha.SubnetClaim{}, func(clusterName multicluster.ClusterName, cl cluster.Cluster) handler.TypedEventHandler[client.Object, mcreconcile.Request] { + return handler.TypedEnqueueRequestsFromMapFunc(func(ctx context.Context, o client.Object) []mcreconcile.Request { + subnetClaim := o.(*networkingv1alpha.SubnetClaim) + return enqueueWorkloadDeploymentByLocation(ctx, mgr, clusterName, subnetClaim.Spec.Location) + }) + }). + Watches(&networkingv1alpha.Subnet{}, func(clusterName multicluster.ClusterName, cl cluster.Cluster) handler.TypedEventHandler[client.Object, mcreconcile.Request] { + return handler.TypedEnqueueRequestsFromMapFunc(func(ctx context.Context, o client.Object) []mcreconcile.Request { + subnet := o.(*networkingv1alpha.Subnet) + return enqueueWorkloadDeploymentByLocation(ctx, mgr, clusterName, subnet.Spec.Location) + }) }) - }). - Watches(&networkingv1alpha.Subnet{}, func(clusterName string, cl cluster.Cluster) handler.TypedEventHandler[client.Object, mcreconcile.Request] { - return handler.TypedEnqueueRequestsFromMapFunc(func(ctx context.Context, o client.Object) []mcreconcile.Request { - subnet := o.(*networkingv1alpha.Subnet) - return enqueueWorkloadDeploymentByLocation(ctx, mgr, clusterName, subnet.Spec.Location) - }) - }). 
- Complete(r) + } + + return b.Complete(r) } -func enqueueWorkloadDeploymentByLocation(ctx context.Context, mgr mcmanager.Manager, clusterName string, locationRef networkingv1alpha.LocationReference) []mcreconcile.Request { +func enqueueWorkloadDeploymentByLocation(ctx context.Context, mgr mcmanager.Manager, clusterName multicluster.ClusterName, locationRef networkingv1alpha.LocationReference) []mcreconcile.Request { logger := log.FromContext(ctx) - cluster, err := mgr.GetCluster(ctx, clusterName) + cl, err := mgr.GetCluster(ctx, clusterName) if err != nil { logger.Error(err, "failed to get cluster") return nil } - clusterClient := cluster.GetClient() + clusterClient := cl.GetClient() - locationName := (types.NamespacedName{ + // Resolve the Location to find its city code, then look up WorkloadDeployments + // that target the same city via the deploymentCityCodeIndex. + var location networkingv1alpha.Location + if err := clusterClient.Get(ctx, types.NamespacedName{ Namespace: locationRef.Namespace, Name: locationRef.Name, - }).String() - listOpts := client.MatchingFields{ - deploymentLocationIndex: locationName, + }, &location); err != nil { + logger.Error(err, "failed to get location for enqueue", "location", locationRef) + return nil } - var workloadDeployments computev1alpha.WorkloadDeploymentList + cityCode, ok := location.Spec.Topology["topology.datum.net/city-code"] + if !ok { + return nil + } - if err := clusterClient.List(ctx, &workloadDeployments, listOpts); err != nil { - logger.Error(err, "failed to list workloads") + var workloadDeployments computev1alpha.WorkloadDeploymentList + if err := clusterClient.List(ctx, &workloadDeployments, client.MatchingFields{ + deploymentCityCodeIndex: cityCode, + }); err != nil { + logger.Error(err, "failed to list workload deployments") return nil } diff --git a/internal/controller/workloaddeployment_federator.go b/internal/controller/workloaddeployment_federator.go new file mode 100644 index 00000000..9c736cf0 --- 
/dev/null +++ b/internal/controller/workloaddeployment_federator.go @@ -0,0 +1,405 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package controller + +import ( + "context" + "fmt" + "strings" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/equality" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + "sigs.k8s.io/controller-runtime/pkg/finalizer" + "sigs.k8s.io/controller-runtime/pkg/log" + mcbuilder "sigs.k8s.io/multicluster-runtime/pkg/builder" + mccontext "sigs.k8s.io/multicluster-runtime/pkg/context" + mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager" + mcreconcile "sigs.k8s.io/multicluster-runtime/pkg/reconcile" + + karmadapolicyv1alpha1 "github.com/karmada-io/api/policy/v1alpha1" + computev1alpha "go.datum.net/compute/api/v1alpha" + "go.miloapis.com/milo/pkg/downstreamclient" +) + +const ( + // federatorFinalizer is added to project-namespace WorkloadDeployments that + // have been federated to the downstream control plane. It ensures we clean up + // the downstream object and any orphaned PropagationPolicies before the project + // object is permanently deleted. + federatorFinalizer = "compute.datumapis.com/federator" + + // cityCodeLabel is applied to WorkloadDeployments in the downstream namespace + // and is used by PropagationPolicy selectors to route them to the correct + // POP-cell clusters. Downstream Cluster objects are expected to carry this + // label with their city-code value. + cityCodeLabel = "topology.datum.net/city-code" + + // kindWorkloadDeployment is the Kind string for WorkloadDeployment resources. 
+ kindWorkloadDeployment = "WorkloadDeployment" +) + +// WorkloadDeploymentFederator replicates WorkloadDeployments from project +// namespaces into the downstream control plane so it can propagate them to the +// appropriate POP-cell clusters. +// +// For each WorkloadDeployment the controller: +// 1. Determines the downstream namespace via the ns- +// convention (matching the MappedNamespaceResourceStrategy used by +// go.datum.net/network-services-operator; this logic will migrate to Milo +// once the shared library is promoted). +// 2. Upserts a corresponding WorkloadDeployment in that downstream namespace, +// stamped with label topology.datum.net/city-code=. +// 3. Lazily creates a PropagationPolicy per city code per downstream namespace +// that selects WorkloadDeployments by the city-code label and targets +// clusters carrying the same label. The PP is deleted once no deployments +// with that city code remain in the namespace. +// 4. Reads the aggregated status from the downstream control plane and writes +// it back to the project-namespace object. +// 5. On deletion: removes the downstream WorkloadDeployment and cleans up +// unused PropagationPolicies. +type WorkloadDeploymentFederator struct { + mgr mcmanager.Manager + // FederationClient is a client pointed at the Karmada federation control + // plane (the federation hub that the management controllers read and write + // through). The caller (cmd/main.go) constructs it from --federation-kubeconfig. 
+ FederationClient client.Client + finalizers finalizer.Finalizers +} + +// +kubebuilder:rbac:groups=compute.datumapis.com,resources=workloaddeployments,verbs=get;list;watch;update;patch +// +kubebuilder:rbac:groups=compute.datumapis.com,resources=workloaddeployments/status,verbs=get;update;patch +// +kubebuilder:rbac:groups=compute.datumapis.com,resources=workloaddeployments/finalizers,verbs=update +// +kubebuilder:rbac:groups=core,resources=namespaces,verbs=get;list + +func (r *WorkloadDeploymentFederator) Reconcile(ctx context.Context, req mcreconcile.Request) (ctrl.Result, error) { + if r.FederationClient == nil { + return ctrl.Result{}, nil + } + + logger := log.FromContext(ctx) + + cl, err := r.mgr.GetCluster(ctx, req.ClusterName) + if err != nil { + return ctrl.Result{}, err + } + ctx = mccontext.WithCluster(ctx, req.ClusterName) + + var deployment computev1alpha.WorkloadDeployment + if err := cl.GetClient().Get(ctx, req.NamespacedName, &deployment); err != nil { + if apierrors.IsNotFound(err) { + return ctrl.Result{}, nil + } + return ctrl.Result{}, err + } + + finalizationResult, err := r.finalizers.Finalize(ctx, &deployment) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed to finalize: %w", err) + } + if finalizationResult.Updated { + if err = cl.GetClient().Update(ctx, &deployment); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to update based on finalization result: %w", err) + } + return ctrl.Result{}, nil + } + + if !deployment.DeletionTimestamp.IsZero() { + return ctrl.Result{}, nil + } + + logger.Info("federating deployment to downstream control plane") + + // Determine the downstream namespace for this project namespace using the + // ns- convention (MappedNamespaceResourceStrategy). + // Using strategy.GetClient() for writes ensures the downstream namespace is + // created with UpstreamOwnerNamespaceLabel so the InstanceProjector can + // resolve the target project namespace without scanning all namespaces. 
+ strategy := downstreamclient.NewMappedNamespaceResourceStrategy(string(req.ClusterName), cl.GetClient(), r.FederationClient) + downstreamNS, err := strategy.GetDownstreamNamespaceNameForUpstreamNamespace(ctx, deployment.Namespace) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed to determine downstream namespace: %w", err) + } + + // Ensure the downstream namespace exists and carries the upstream tracking + // labels so the InstanceProjector can resolve the project namespace by label + // lookup instead of scanning all namespaces. + if err := r.ensureDownstreamNamespace(ctx, downstreamNS, deployment.Namespace, string(req.ClusterName)); err != nil { + return ctrl.Result{}, err + } + + // Upsert the WorkloadDeployment in the downstream control plane via the + // strategy client so any future Create calls also go through + // ensureDownstreamNamespace automatically. + if err := r.upsertDownstreamDeployment(ctx, strategy.GetClient(), &deployment, downstreamNS); err != nil { + return ctrl.Result{}, err + } + + // Lazily create the PropagationPolicy that targets clusters with the matching + // city-code label. + if err := r.ensurePropagationPolicy(ctx, downstreamNS, deployment.Spec.CityCode); err != nil { + return ctrl.Result{}, err + } + + // Pull aggregated status from the downstream control plane back into the + // project namespace. + if err := r.syncStatusFromDownstream(ctx, cl.GetClient(), &deployment, downstreamNS); err != nil { + return ctrl.Result{}, err + } + + logger.Info("federation complete") + return ctrl.Result{}, nil +} + +// Finalize removes the downstream WorkloadDeployment and, if no other +// deployments with the same city code remain in the downstream namespace, deletes +// the PropagationPolicy as well. 
+func (r *WorkloadDeploymentFederator) Finalize(ctx context.Context, obj client.Object) (finalizer.Result, error) { + if r.FederationClient == nil { + return finalizer.Result{}, nil + } + + deployment := obj.(*computev1alpha.WorkloadDeployment) + logger := log.FromContext(ctx).WithValues( + "deployment", deployment.Name, + "namespace", deployment.Namespace, + ) + + clusterName, ok := mccontext.ClusterFrom(ctx) + if !ok { + return finalizer.Result{}, fmt.Errorf("cluster name not found in context") + } + + cl, err := r.mgr.GetCluster(ctx, clusterName) + if err != nil { + return finalizer.Result{}, err + } + + strategy := downstreamclient.NewMappedNamespaceResourceStrategy(string(clusterName), cl.GetClient(), r.FederationClient) + downstreamNS, err := strategy.GetDownstreamNamespaceNameForUpstreamNamespace(ctx, deployment.Namespace) + if err != nil { + return finalizer.Result{}, fmt.Errorf("failed to determine downstream namespace during finalization: %w", err) + } + + // Delete the downstream WorkloadDeployment. + kd := &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: deployment.Name, + Namespace: downstreamNS, + }, + } + if err := r.FederationClient.Delete(ctx, kd); client.IgnoreNotFound(err) != nil { + return finalizer.Result{}, fmt.Errorf("failed to delete downstream deployment %s/%s: %w", downstreamNS, deployment.Name, err) + } + logger.Info("deleted downstream WorkloadDeployment", "downstreamNamespace", downstreamNS) + + // Clean up the PropagationPolicy if no other deployments with the same city + // code remain in this downstream namespace. + if err := r.cleanupPropagationPolicyIfUnused(ctx, downstreamNS, deployment.Spec.CityCode); err != nil { + return finalizer.Result{}, err + } + + return finalizer.Result{}, nil +} + +// ensureDownstreamNamespace creates or updates the downstream namespace, stamping +// it with the upstream tracking labels that MappedNamespaceResourceStrategy uses. 
+// This allows the InstanceProjector to resolve the project namespace name via a +// direct label lookup rather than scanning all namespaces by UID. +func (r *WorkloadDeploymentFederator) ensureDownstreamNamespace(ctx context.Context, name, upstreamNamespace, clusterName string) error { + ns := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: name}} + _, err := controllerutil.CreateOrUpdate(ctx, r.FederationClient, ns, func() error { + if ns.Labels == nil { + ns.Labels = make(map[string]string) + } + ns.Labels[downstreamclient.UpstreamOwnerClusterNameLabel] = fmt.Sprintf("cluster-%s", strings.ReplaceAll(clusterName, "/", "_")) + ns.Labels[downstreamclient.UpstreamOwnerNamespaceLabel] = upstreamNamespace + return nil + }) + if err != nil { + return fmt.Errorf("failed to ensure downstream namespace %q: %w", name, err) + } + return nil +} + +// upsertDownstreamDeployment creates or updates the WorkloadDeployment in the +// downstream namespace via the provided client (expected to be strategy.GetClient() +// so the downstream namespace is created with upstream tracking labels). 
+func (r *WorkloadDeploymentFederator) upsertDownstreamDeployment( + ctx context.Context, + downstreamClient client.Client, + deployment *computev1alpha.WorkloadDeployment, + downstreamNS string, +) error { + kd := &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: deployment.Name, + Namespace: downstreamNS, + }, + } + + result, err := controllerutil.CreateOrPatch(ctx, downstreamClient, kd, func() error { + if kd.Labels == nil { + kd.Labels = make(map[string]string) + } + kd.Labels[cityCodeLabel] = deployment.Spec.CityCode + kd.Labels[downstreamclient.UpstreamOwnerNamespaceLabel] = deployment.Namespace + kd.Spec = deployment.Spec + return nil + }) + if err != nil { + return fmt.Errorf("failed to upsert downstream deployment %s/%s: %w", downstreamNS, deployment.Name, err) + } + + log.FromContext(ctx).Info("upserted downstream deployment", "result", result, "downstreamNamespace", downstreamNS) + return nil +} + +// ensurePropagationPolicy creates or updates a PropagationPolicy in the downstream +// namespace that selects all WorkloadDeployments with the given city-code label +// and targets clusters carrying the same label. +func (r *WorkloadDeploymentFederator) ensurePropagationPolicy( + ctx context.Context, + downstreamNS string, + cityCode string, +) error { + pp := &karmadapolicyv1alpha1.PropagationPolicy{ + ObjectMeta: metav1.ObjectMeta{ + Name: propagationPolicyNameFor(cityCode), + Namespace: downstreamNS, + }, + } + + result, err := controllerutil.CreateOrPatch(ctx, r.FederationClient, pp, func() error { + pp.Spec = karmadapolicyv1alpha1.PropagationSpec{ + // Select all WorkloadDeployments in this namespace that carry the + // city-code label. Using a label selector (rather than individual + // resource names) means that new deployments for this city are + // automatically picked up without updating the policy. 
+ ResourceSelectors: []karmadapolicyv1alpha1.ResourceSelector{ + { + APIVersion: computev1alpha.GroupVersion.String(), + Kind: kindWorkloadDeployment, + LabelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + cityCodeLabel: cityCode, + }, + }, + }, + }, + Placement: karmadapolicyv1alpha1.Placement{ + // Route to clusters that carry the same city-code label. POP-cell + // clusters registered with the downstream control plane must be + // labeled accordingly. + ClusterAffinity: &karmadapolicyv1alpha1.ClusterAffinity{ + LabelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + cityCodeLabel: cityCode, + }, + }, + }, + }, + } + return nil + }) + if err != nil { + return fmt.Errorf("failed to upsert PropagationPolicy for city %q in %s: %w", cityCode, downstreamNS, err) + } + + log.FromContext(ctx).Info("upserted PropagationPolicy", "result", result, "cityCode", cityCode, "downstreamNamespace", downstreamNS) + return nil +} + +// syncStatusFromDownstream reads the aggregated status of the WorkloadDeployment +// from the downstream namespace and writes it back to the project-namespace +// object. It is a no-op when the downstream object does not yet exist. 
+func (r *WorkloadDeploymentFederator) syncStatusFromDownstream( + ctx context.Context, + projectClient client.Client, + deployment *computev1alpha.WorkloadDeployment, + downstreamNS string, +) error { + var kd computev1alpha.WorkloadDeployment + if err := r.FederationClient.Get(ctx, types.NamespacedName{ + Name: deployment.Name, + Namespace: downstreamNS, + }, &kd); err != nil { + if apierrors.IsNotFound(err) { + return nil + } + return fmt.Errorf("failed to get downstream deployment for status sync: %w", err) + } + + if equality.Semantic.DeepEqual(deployment.Status, kd.Status) { + return nil + } + + deployment.Status = kd.Status + if err := projectClient.Status().Update(ctx, deployment); err != nil { + return fmt.Errorf("failed to write downstream status back to project deployment: %w", err) + } + return nil +} + +// cleanupPropagationPolicyIfUnused deletes the PropagationPolicy for the given +// city code if no WorkloadDeployments with that city code remain in the +// downstream namespace. +func (r *WorkloadDeploymentFederator) cleanupPropagationPolicyIfUnused( + ctx context.Context, + downstreamNS string, + cityCode string, +) error { + var remaining computev1alpha.WorkloadDeploymentList + if err := r.FederationClient.List(ctx, &remaining, + client.InNamespace(downstreamNS), + client.MatchingLabels{cityCodeLabel: cityCode}, + ); err != nil { + return fmt.Errorf("failed to list remaining downstream deployments for city %q: %w", cityCode, err) + } + + if len(remaining.Items) > 0 { + // Other deployments still need this PropagationPolicy. 
+ return nil + } + + pp := &karmadapolicyv1alpha1.PropagationPolicy{ + ObjectMeta: metav1.ObjectMeta{ + Name: propagationPolicyNameFor(cityCode), + Namespace: downstreamNS, + }, + } + if err := r.FederationClient.Delete(ctx, pp); client.IgnoreNotFound(err) != nil { + return fmt.Errorf("failed to delete PropagationPolicy for city %q in %s: %w", cityCode, downstreamNS, err) + } + + log.FromContext(ctx).Info("deleted PropagationPolicy (no more deployments for city)", "cityCode", cityCode, "downstreamNamespace", downstreamNS) + return nil +} + +// SetupWithManager registers the controller with the multicluster manager. +// It must only be called when FederationClient is non-nil. +func (r *WorkloadDeploymentFederator) SetupWithManager(mgr mcmanager.Manager) error { + r.mgr = mgr + r.finalizers = finalizer.NewFinalizers() + if err := r.finalizers.Register(federatorFinalizer, r); err != nil { + return fmt.Errorf("failed to register federator finalizer: %w", err) + } + return mcbuilder.ControllerManagedBy(mgr). + For(&computev1alpha.WorkloadDeployment{}, mcbuilder.WithEngageWithLocalCluster(false)). + Named("workload-deployment-federator"). + Complete(r) +} + +// propagationPolicyNameFor returns the PropagationPolicy name for a given city +// code. The name is stable and deterministic so that multiple reconciles of +// different deployments sharing the same city code converge on the same policy. +func propagationPolicyNameFor(cityCode string) string { + // Sanitize the city code to a valid Kubernetes name: lower-case, spaces → hyphens. 
+ sanitized := strings.ToLower(strings.ReplaceAll(cityCode, " ", "-")) + return fmt.Sprintf("city-%s", sanitized) +} diff --git a/internal/controller/workloaddeployment_federator_test.go b/internal/controller/workloaddeployment_federator_test.go new file mode 100644 index 00000000..2bd2169f --- /dev/null +++ b/internal/controller/workloaddeployment_federator_test.go @@ -0,0 +1,398 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package controller + +import ( + "context" + "testing" + "time" + + karmadapolicyv1alpha1 "github.com/karmada-io/api/policy/v1alpha1" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/finalizer" + mccontext "sigs.k8s.io/multicluster-runtime/pkg/context" + mcreconcile "sigs.k8s.io/multicluster-runtime/pkg/reconcile" + + computev1alpha "go.datum.net/compute/api/v1alpha" +) + +// ─── Shared test constants ──────────────────────────────────────────────────── + +const ( + testCluster = "test-project-cluster" + testProjNS = "my-project" + testProjNSUID = types.UID("aabbccdd-0000-1111-2222-333344445555") + testKarmadaNSStr = "ns-aabbccdd-0000-1111-2222-333344445555" + testWDName = "my-workload-deployment" + testCityCodeLAX = "LAX" +) + +// ─── Test helpers ───────────────────────────────────────────────────────────── + +// testProjectNamespace returns a corev1.Namespace for the project cluster with a +// stable UID that matches testKarmadaNSStr. +func testProjectNamespace() *corev1.Namespace { + return &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: testProjNS, + UID: testProjNSUID, + }, + } +} + +// testWorkloadDeployment returns a WorkloadDeployment with the given options. 
+func testWorkloadDeployment(opts ...func(*computev1alpha.WorkloadDeployment)) *computev1alpha.WorkloadDeployment { + wd := &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: testWDName, + Namespace: testProjNS, + UID: "wd-uid-1111", + }, + Spec: computev1alpha.WorkloadDeploymentSpec{ + CityCode: testCityCodeLAX, + WorkloadRef: computev1alpha.WorkloadReference{ + Name: "test-workload", + }, + PlacementName: testDefaultPlacement, + ScaleSettings: computev1alpha.HorizontalScaleSettings{ + MinReplicas: 1, + }, + }, + } + for _, opt := range opts { + opt(wd) + } + return wd +} + +// withFinalizer adds the federator finalizer to the WorkloadDeployment. +func withFinalizer(wd *computev1alpha.WorkloadDeployment) { + wd.Finalizers = append(wd.Finalizers, federatorFinalizer) +} + +// withDeletionTimestamp sets a non-zero DeletionTimestamp on the WorkloadDeployment. +func withDeletionTimestamp(wd *computev1alpha.WorkloadDeployment) { + t := metav1.NewTime(time.Now().Add(-5 * time.Second)) + wd.DeletionTimestamp = &t +} + +// newTestFederator constructs a WorkloadDeploymentFederator wired to the given +// project client (via a fakeMCManager) and downstream client. The federator +// finalizer is pre-registered so reconcile can handle deletions. +func newTestFederator(projectClient client.Client, karmadaClient client.Client) *WorkloadDeploymentFederator { + projectCluster := newFakeCluster(projectClient) + mgr := newFakeMCManager(testCluster, projectCluster) + + r := &WorkloadDeploymentFederator{ + mgr: mgr, + FederationClient: karmadaClient, + } + + feds := finalizer.NewFinalizers() + if err := feds.Register(federatorFinalizer, r); err != nil { + panic("failed to register test finalizer: " + err.Error()) + } + r.finalizers = feds + return r +} + +// reconcileRequest builds an mcreconcile.Request for the test WorkloadDeployment. 
+func reconcileRequest() mcreconcile.Request {
+	// Key of the fixture WorkloadDeployment inside the project cluster.
+	wdKey := types.NamespacedName{Namespace: testProjNS, Name: testWDName}
+
+	return mcreconcile.Request{
+		ClusterName: testCluster,
+		Request:     ctrl.Request{NamespacedName: wdKey},
+	}
+}
+
+// ─── Unit tests ───────────────────────────────────────────────────────────────
+
+// TestPropagationPolicyNameFor checks the city-code to policy-name mapping:
+// the code is lower-cased, spaces become hyphens, and "city-" is prefixed.
+func TestPropagationPolicyNameFor(t *testing.T) {
+	t.Parallel()
+
+	cases := []struct {
+		cityCode string
+		want     string
+	}{
+		{cityCode: "LAX", want: "city-lax"},
+		{cityCode: "lax", want: "city-lax"},
+		{cityCode: "New York", want: "city-new-york"},
+		{cityCode: "LOS ANGELES", want: "city-los-angeles"},
+		{cityCode: "SEA", want: "city-sea"},
+	}
+
+	for _, tc := range cases {
+		// Each city code runs as its own parallel subtest, named after the input.
+		t.Run(tc.cityCode, func(t *testing.T) {
+			t.Parallel()
+			assert.Equal(t, tc.want, propagationPolicyNameFor(tc.cityCode))
+		})
+	}
+}
+
+// TestWorkloadDeploymentFederator_NoFederationClient checks that Reconcile
+// returns an empty result and no error when FederationClient is nil.
+func TestWorkloadDeploymentFederator_NoFederationClient(t *testing.T) {
+	t.Parallel()
+
+	fed := newTestFederator(
+		newProjectFakeClient(testProjectNamespace(), testWorkloadDeployment()),
+		nil,
+	)
+	fed.FederationClient = nil // explicitly nil
+
+	got, err := fed.Reconcile(context.Background(), reconcileRequest())
+	require.NoError(t, err)
+	assert.Equal(t, ctrl.Result{}, got)
+}
+
+// TestWorkloadDeploymentFederator_AddsFinalizerOnFirstSeen verifies that the
+// first reconcile of a brand-new WorkloadDeployment adds the finalizer and
+// returns without federating (the finalizer update triggers a re-queue).
+func TestWorkloadDeploymentFederator_AddsFinalizerOnFirstSeen(t *testing.T) {
+	t.Parallel()
+
+	ctx := context.Background()
+
+	// The fixture is created without options, so it carries no finalizer yet.
+	projectClient := newProjectFakeClient(testProjectNamespace(), testWorkloadDeployment())
+	karmadaClient := newKarmadaFakeClient()
+	fed := newTestFederator(projectClient, karmadaClient)
+
+	got, err := fed.Reconcile(ctx, reconcileRequest())
+	require.NoError(t, err)
+	assert.Equal(t, ctrl.Result{}, got)
+
+	// Re-read the project-side object: the finalizer must have been persisted.
+	wdKey := types.NamespacedName{Namespace: testProjNS, Name: testWDName}
+	var stored computev1alpha.WorkloadDeployment
+	require.NoError(t, projectClient.Get(ctx, wdKey, &stored))
+	assert.Contains(t, stored.Finalizers, federatorFinalizer)
+
+	// Nothing may have been mirrored to Karmada on this first pass.
+	var mirrored computev1alpha.WorkloadDeploymentList
+	require.NoError(t, karmadaClient.List(ctx, &mirrored))
+	assert.Empty(t, mirrored.Items, "no Karmada WD should be created on first-seen reconcile")
+}
+
+// TestWorkloadDeploymentFederator_FederatesToKarmada reconciles a
+// WorkloadDeployment that already has the finalizer and checks everything the
+// federator is expected to create on the Karmada side: the namespace, the
+// mirrored WorkloadDeployment (labelled with its city code), and the
+// per-city PropagationPolicy with matching resource and cluster selectors.
+func TestWorkloadDeploymentFederator_FederatesToKarmada(t *testing.T) {
+	t.Parallel()
+
+	ctx := context.Background()
+
+	projectClient := newProjectFakeClient(testProjectNamespace(), testWorkloadDeployment(withFinalizer))
+	karmadaClient := newKarmadaFakeClient()
+	fed := newTestFederator(projectClient, karmadaClient)
+
+	got, err := fed.Reconcile(ctx, reconcileRequest())
+	require.NoError(t, err)
+	assert.Equal(t, ctrl.Result{}, got)
+
+	// 1. Namespace derived from the project namespace UID.
+	var mirroredNS corev1.Namespace
+	nsKey := types.NamespacedName{Name: testKarmadaNSStr}
+	require.NoError(t, karmadaClient.Get(ctx, nsKey, &mirroredNS),
+		"Karmada namespace %q should exist", testKarmadaNSStr)
+
+	// 2. Mirrored WorkloadDeployment, labelled and carrying the same city code.
+	var mirroredWD computev1alpha.WorkloadDeployment
+	wdKey := types.NamespacedName{Namespace: testKarmadaNSStr, Name: testWDName}
+	require.NoError(t, karmadaClient.Get(ctx, wdKey, &mirroredWD),
+		"Karmada WorkloadDeployment should exist")
+	assert.Equal(t, testCityCodeLAX, mirroredWD.Labels[cityCodeLabel],
+		"city-code label should be set on Karmada WD")
+	assert.Equal(t, testCityCodeLAX, mirroredWD.Spec.CityCode,
+		"spec.cityCode should be copied from project WD")
+
+	// 3. PropagationPolicy named after the city code.
+	policyName := propagationPolicyNameFor(testCityCodeLAX)
+	var policy karmadapolicyv1alpha1.PropagationPolicy
+	policyKey := types.NamespacedName{Namespace: testKarmadaNSStr, Name: policyName}
+	require.NoError(t, karmadaClient.Get(ctx, policyKey, &policy),
+		"PropagationPolicy %q should exist", policyName)
+
+	// The policy selects WorkloadDeployments by the city-code label ...
+	require.Len(t, policy.Spec.ResourceSelectors, 1)
+	resourceSel := policy.Spec.ResourceSelectors[0]
+	assert.Equal(t, computev1alpha.GroupVersion.String(), resourceSel.APIVersion)
+	assert.Equal(t, "WorkloadDeployment", resourceSel.Kind)
+	require.NotNil(t, resourceSel.LabelSelector)
+	assert.Equal(t, testCityCodeLAX, resourceSel.LabelSelector.MatchLabels[cityCodeLabel])
+
+	// ... and targets clusters labelled with that same city code.
+	affinity := policy.Spec.Placement.ClusterAffinity
+	require.NotNil(t, affinity)
+	require.NotNil(t, affinity.LabelSelector)
+	assert.Equal(t, testCityCodeLAX, affinity.LabelSelector.MatchLabels[cityCodeLabel])
+}
+
+// TestWorkloadDeploymentFederator_Finalization covers the deletion scenarios:
+// cleanup of Karmada resources and conditional PropagationPolicy removal.
+func TestWorkloadDeploymentFederator_Finalization(t *testing.T) { + t.Parallel() + + ppName := propagationPolicyNameFor(testCityCodeLAX) + + tests := []struct { + name string + // karmadaExtra holds additional Karmada objects beyond the "own" WD and PP. + karmadaExtra []client.Object + wantPPGone bool + }{ + { + name: "last WD for city — PropagationPolicy removed", + karmadaExtra: nil, + wantPPGone: true, + }, + { + name: "other WD for same city remains — PropagationPolicy kept", + karmadaExtra: []client.Object{ + // A sibling WD in the same Karmada namespace with the same city-code. + &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "other-deployment", + Namespace: testKarmadaNSStr, + Labels: map[string]string{cityCodeLabel: testCityCodeLAX}, + }, + Spec: computev1alpha.WorkloadDeploymentSpec{ + CityCode: testCityCodeLAX, + PlacementName: "other", + WorkloadRef: computev1alpha.WorkloadReference{Name: "other"}, + ScaleSettings: computev1alpha.HorizontalScaleSettings{MinReplicas: 1}, + }, + }, + }, + wantPPGone: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + // Project cluster: namespace + WD with finalizer and deletion timestamp. + wd := testWorkloadDeployment(withFinalizer, withDeletionTimestamp) + projectClient := newProjectFakeClient(testProjectNamespace(), wd) + + // Karmada cluster: the mirrored WD + its PropagationPolicy + any extras. 
+ karmadaWD := &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: testWDName, + Namespace: testKarmadaNSStr, + Labels: map[string]string{cityCodeLabel: testCityCodeLAX}, + }, + Spec: computev1alpha.WorkloadDeploymentSpec{ + CityCode: testCityCodeLAX, + PlacementName: testDefaultPlacement, + WorkloadRef: computev1alpha.WorkloadReference{Name: "test-workload"}, + ScaleSettings: computev1alpha.HorizontalScaleSettings{MinReplicas: 1}, + }, + } + karmadaPP := &karmadapolicyv1alpha1.PropagationPolicy{ + ObjectMeta: metav1.ObjectMeta{ + Name: ppName, + Namespace: testKarmadaNSStr, + }, + } + karmadaObjs := []client.Object{ + &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: testKarmadaNSStr}}, + karmadaWD, + karmadaPP, + } + karmadaObjs = append(karmadaObjs, tt.karmadaExtra...) + karmadaClient := newKarmadaFakeClient(karmadaObjs...) + + r := newTestFederator(projectClient, karmadaClient) + + result, err := r.Reconcile(context.Background(), reconcileRequest()) + require.NoError(t, err) + assert.Equal(t, ctrl.Result{}, result) + + ctx := context.Background() + + // The Karmada-side WD must be gone. + var remainingWD computev1alpha.WorkloadDeployment + err = karmadaClient.Get(ctx, types.NamespacedName{ + Name: testWDName, + Namespace: testKarmadaNSStr, + }, &remainingWD) + assert.True(t, apierrors.IsNotFound(err), + "Karmada WD %q should be deleted after finalization", testWDName) + + // PropagationPolicy presence depends on whether siblings remain. 
+ var remainingPP karmadapolicyv1alpha1.PropagationPolicy + err = karmadaClient.Get(ctx, types.NamespacedName{ + Name: ppName, + Namespace: testKarmadaNSStr, + }, &remainingPP) + if tt.wantPPGone { + assert.True(t, apierrors.IsNotFound(err), + "PropagationPolicy should be deleted when no city siblings remain") + } else { + assert.NoError(t, err, + "PropagationPolicy should be kept when other city siblings remain") + } + + // The project WD should be gone: once the federator finalizer is removed + // from an object that already has a DeletionTimestamp, the API server + // (and the fake client) garbage-collects the object. + var updatedWD computev1alpha.WorkloadDeployment + err = projectClient.Get(ctx, + types.NamespacedName{Name: testWDName, Namespace: testProjNS}, &updatedWD) + assert.True(t, apierrors.IsNotFound(err), + "project WD should be gone after finalizer removal (DeletionTimestamp + empty Finalizers = GC)") + }) + } +} + +// TestWorkloadDeploymentFederator_NotFound verifies that a missing +// WorkloadDeployment is handled gracefully (no error, no action). +func TestWorkloadDeploymentFederator_NotFound(t *testing.T) { + t.Parallel() + + projectClient := newProjectFakeClient(testProjectNamespace()) // WD missing + karmadaClient := newKarmadaFakeClient() + r := newTestFederator(projectClient, karmadaClient) + + result, err := r.Reconcile(context.Background(), reconcileRequest()) + require.NoError(t, err) + assert.Equal(t, ctrl.Result{}, result) +} + +// TestWorkloadDeploymentFederator_Finalize_DirectCall exercises the Finalize +// method directly, ensuring the cluster name is required in context. +func TestWorkloadDeploymentFederator_Finalize_DirectCall(t *testing.T) { + t.Parallel() + + projectClient := newProjectFakeClient(testProjectNamespace()) + karmadaClient := newKarmadaFakeClient() + r := newTestFederator(projectClient, karmadaClient) + + wd := testWorkloadDeployment(withFinalizer) + + // Without cluster in context → must return an error. 
+ _, err := r.Finalize(context.Background(), wd) + require.Error(t, err, "Finalize without cluster context should fail") + assert.Contains(t, err.Error(), "cluster name not found") + + // With cluster in context → must succeed (karmada client returns not-found, which is OK). + ctx := mccontext.WithCluster(context.Background(), testCluster) + result, err := r.Finalize(ctx, wd) + require.NoError(t, err) + assert.False(t, result.Updated) +} diff --git a/internal/controller/workloaddeployment_location_test.go b/internal/controller/workloaddeployment_location_test.go new file mode 100644 index 00000000..ff996e73 --- /dev/null +++ b/internal/controller/workloaddeployment_location_test.go @@ -0,0 +1,119 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package controller + +import ( + "context" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + + computev1alpha "go.datum.net/compute/api/v1alpha" + networkingv1alpha "go.datum.net/network-services-operator/api/v1alpha" +) + +// newNetworkingScheme returns a scheme with compute + networkingv1alpha types. +func newNetworkingScheme() *runtime.Scheme { + s := runtime.NewScheme() + _ = computev1alpha.AddToScheme(s) + _ = networkingv1alpha.AddToScheme(s) + return s +} + +// TestReconcileNetworks_PersistsLocation_WhenLocationFound verifies that when a +// Location object matching the deployment's city code exists in the cluster, the +// resolved LocationReference is returned by reconcileNetworks and can be persisted +// to deployment.Status.Location. Instance creation must not be blocked — the +// function returns networkReady=false only because no NetworkInterfaces exist on +// the deployment in this scenario (short-circuit before bindings), not because +// Location was absent. 
+func TestReconcileNetworks_PersistsLocation_WhenLocationFound(t *testing.T) {
+	t.Parallel()
+
+	const (
+		cityCode          = "DFW"
+		locationName      = "loc-dfw-1"
+		locationNamespace = "networking-system"
+	)
+
+	// Seed the fake cluster with a single Location whose topology carries the
+	// same city code as the deployment under test.
+	matching := &networkingv1alpha.Location{
+		ObjectMeta: metav1.ObjectMeta{Name: locationName, Namespace: locationNamespace},
+		Spec: networkingv1alpha.LocationSpec{
+			Topology: map[string]string{"topology.datum.net/city-code": cityCode},
+		},
+	}
+	fakeClient := fake.NewClientBuilder().
+		WithScheme(newNetworkingScheme()).
+		WithObjects(matching).
+		Build()
+
+	// The spec deliberately has no NetworkInterfaces; only the location lookup
+	// is under test here, so the readiness return value is ignored below.
+	wd := &computev1alpha.WorkloadDeployment{
+		ObjectMeta: metav1.ObjectMeta{Name: "test-wd", Namespace: "default"},
+		Spec:       computev1alpha.WorkloadDeploymentSpec{CityCode: cityCode},
+	}
+
+	reconciler := &WorkloadDeploymentReconciler{}
+	_, resolved, err := reconciler.reconcileNetworks(context.Background(), fakeClient, wd)
+
+	require.NoError(t, err)
+	require.NotNil(t, resolved,
+		"resolved location must be non-nil when a matching Location object exists")
+	assert.Equal(t, locationName, resolved.Name)
+	assert.Equal(t, locationNamespace, resolved.Namespace)
+
+	// Mirror the Reconcile loop: store the resolved reference on the status and
+	// confirm it reads back with the Location's name.
+	wd.Status.Location = resolved
+	assert.Equal(t, locationName, wd.Status.Location.Name,
+		"Status.Location.Name must match the resolved Location object name")
+}
+
+// TestReconcileNetworks_ReturnsNilLocation_WhenNoLocationFound verifies that
+// when no Location object in the cluster matches the deployment's city code,
+// reconcileNetworks returns (false, nil, nil) — no error and no resolved
+// location. The caller must treat nil location as best-effort and must NOT block
+// instance creation.
+func TestReconcileNetworks_ReturnsNilLocation_WhenNoLocationFound(t *testing.T) { + t.Parallel() + + s := newNetworkingScheme() + // Cluster has a Location for a DIFFERENT city code. + otherLocation := &networkingv1alpha.Location{ + ObjectMeta: metav1.ObjectMeta{Name: "loc-ord-1", Namespace: "networking-system"}, + Spec: networkingv1alpha.LocationSpec{ + Topology: map[string]string{ + "topology.datum.net/city-code": "ORD", + }, + }, + } + cl := fake.NewClientBuilder().WithScheme(s).WithObjects(otherLocation).Build() + + deployment := &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{Name: "test-wd", Namespace: "default"}, + Spec: computev1alpha.WorkloadDeploymentSpec{ + CityCode: "DFW", // no matching Location + }, + } + + r := &WorkloadDeploymentReconciler{} + networkReady, resolvedLocation, err := r.reconcileNetworks(context.Background(), cl, deployment) + + require.NoError(t, err, "missing location must not cause an error") + assert.False(t, networkReady, "network is not ready when no location is found") + assert.Nil(t, resolvedLocation, + "resolved location must be nil when no matching Location object exists") + + // Status.Location remains nil — callers must not update it in this case. + // Confirm the deployment's Status.Location is unaffected (nil → nil). 
+ assert.Nil(t, deployment.Status.Location, + "Status.Location must remain nil when no Location matches the city code") +} diff --git a/internal/controller/workloaddeployment_scheduler.go b/internal/controller/workloaddeployment_scheduler.go deleted file mode 100644 index 041b0d64..00000000 --- a/internal/controller/workloaddeployment_scheduler.go +++ /dev/null @@ -1,153 +0,0 @@ -// SPDX-License-Identifier: AGPL-3.0-only - -package controller - -import ( - "context" - "fmt" - "time" - - apierrors "k8s.io/apimachinery/pkg/api/errors" - apimeta "k8s.io/apimachinery/pkg/api/meta" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/log" - "sigs.k8s.io/controller-runtime/pkg/predicate" - mcbuilder "sigs.k8s.io/multicluster-runtime/pkg/builder" - mccontext "sigs.k8s.io/multicluster-runtime/pkg/context" - mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager" - mcreconcile "sigs.k8s.io/multicluster-runtime/pkg/reconcile" - - computev1alpha "go.datum.net/compute/api/v1alpha" - networkingv1alpha "go.datum.net/network-services-operator/api/v1alpha" -) - -// WorkloadDeploymentScheduler schedules a WorkloadDeployment -type WorkloadDeploymentScheduler struct { - mgr mcmanager.Manager -} - -func (r *WorkloadDeploymentScheduler) Reconcile(ctx context.Context, req mcreconcile.Request) (ctrl.Result, error) { - logger := log.FromContext(ctx) - - cl, err := r.mgr.GetCluster(ctx, req.ClusterName) - if err != nil { - return ctrl.Result{}, err - } - - ctx = mccontext.WithCluster(ctx, req.ClusterName) - var deployment computev1alpha.WorkloadDeployment - if err := cl.GetClient().Get(ctx, req.NamespacedName, &deployment); err != nil { - if apierrors.IsNotFound(err) { - return ctrl.Result{}, nil - } - return ctrl.Result{}, err - } - - if !deployment.DeletionTimestamp.IsZero() { - return ctrl.Result{}, nil - } - - logger.Info("scheduling deployment") - defer 
logger.Info("scheduling complete") - - // TODO(jreese) improve! - // The first iteration of this scheduler will be very simple and only look for - // the first available location that is viable for the deployment. In the - // future, we could see a more advanced system similar to the Kubernetes - // scheduler itself. - - // Step 1: Get Locations - var locations networkingv1alpha.LocationList - if err := cl.GetClient().List(ctx, &locations); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to list locations: %w", err) - } - - if len(locations.Items) == 0 { - // Should only be the case in new environments if workloads are created - // prior to location registration. - - changed := apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ - Type: "Available", - Status: metav1.ConditionFalse, - Reason: "NoLocations", - ObservedGeneration: deployment.Generation, - Message: "No locations are registered with the system.", - }) - if changed { - // TODO(jreese) investigate kubevirt / other operators for better tracking - // of updates to the status. I seem to remember a "builder" of sorts that - // looked rather nice. 
- if err := cl.GetClient().Status().Update(ctx, &deployment); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to update deployment status: %w", err) - } - } - - return ctrl.Result{RequeueAfter: 30 * time.Second}, nil - } - - // TODO(jreese) define standard Topology keys somewhere - - var selectedLocation *networkingv1alpha.Location - for _, location := range locations.Items { - cityCode, ok := location.Spec.Topology["topology.datum.net/city-code"] - if ok && cityCode == deployment.Spec.CityCode { - selectedLocation = &location - break - } - } - - if selectedLocation == nil { - changed := apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ - Type: "Available", - Status: metav1.ConditionFalse, - Reason: "NoCandidateLocations", - ObservedGeneration: deployment.Generation, - Message: "No locations are candidates for this deployment.", - }) - if changed { - if err := cl.GetClient().Status().Update(ctx, &deployment); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to update deployment status: %w", err) - } - } - } else { - deployment.Status.Location = &networkingv1alpha.LocationReference{ - Name: selectedLocation.Name, - Namespace: selectedLocation.Namespace, - } - - // TODO(jreese) make sure we don't run into update conflicts with the update - // of the spec then status here. Just can't remember if it's an issue. - - apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ - Type: "Available", - Status: metav1.ConditionFalse, - Reason: "LocationAssigned", - ObservedGeneration: deployment.Generation, - Message: "Deployment has been assigned a location.", - }) - - if err := cl.GetClient().Status().Update(ctx, &deployment); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to update deployment status: %w", err) - } - - } - - return ctrl.Result{}, nil -} - -// SetupWithManager sets up the controller with the Manager. 
-func (r *WorkloadDeploymentScheduler) SetupWithManager(mgr mcmanager.Manager) error { - r.mgr = mgr - return mcbuilder.ControllerManagedBy(mgr). - For(&computev1alpha.WorkloadDeployment{}, mcbuilder.WithPredicates( - predicate.NewPredicateFuncs(func(object client.Object) bool { - // Don't process deployments that have been scheduled - o := object.(*computev1alpha.WorkloadDeployment) - return o.Status.Location == nil - }), - )). - Named("workload-deployment-scheduler"). - Complete(r) -} diff --git a/internal/features/features.go b/internal/features/features.go new file mode 100644 index 00000000..8db20f09 --- /dev/null +++ b/internal/features/features.go @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +// Package features defines the feature gates for the compute operator. Feature +// gates follow the Kubernetes component-base convention: each feature is +// declared as a Feature constant, registered with a FeatureSpec that includes +// its default enablement state, and toggled at runtime via the --feature-gates +// flag exposed by the binary. +// +// Usage in cmd/main.go: +// +// features.MutableFeatureGate.AddFlag(flag.CommandLine) +// +// Usage in controllers: +// +// if features.MutableFeatureGate.Enabled(features.NetworkingIntegration) { ... } +package features + +import ( + "k8s.io/component-base/featuregate" +) + +const ( + // NetworkingIntegration controls whether the compute operator integrates with + // the network-services-operator (VPC) for NetworkBinding provisioning and the + // Network scheduling gate on Instances. + // + // When disabled: + // - No NetworkBinding objects are created. + // - The Network scheduling gate is not added to newly created Instances. + // - Any existing Network scheduling gate is actively removed. + // - The networking step is treated as immediately ready so Instances + // proceed to the runtime without a NetworkBinding. 
+ // + // This flag exists so operators can run compute on edge/lab cells where + // VPC/NSO is not yet functional. The default is true (enabled) so that + // existing production deployments are unaffected. + // + // alpha: v0.1 + NetworkingIntegration featuregate.Feature = "NetworkingIntegration" +) + +// MutableFeatureGate is the mutable feature gate for the compute operator. +// Call MutableFeatureGate.AddFlag to register the --feature-gates flag before +// flag.Parse(). Controllers should read from FeatureGate (the read-only view) +// after startup. +var MutableFeatureGate featuregate.MutableFeatureGate = featuregate.NewFeatureGate() + +// FeatureGate is the read-only view of the compute operator feature gate. +// Use this in controllers and reconcilers rather than MutableFeatureGate to +// avoid accidental mutations after startup. +var FeatureGate featuregate.FeatureGate = MutableFeatureGate + +func init() { + if err := MutableFeatureGate.Add(map[featuregate.Feature]featuregate.FeatureSpec{ + NetworkingIntegration: {Default: true, PreRelease: featuregate.Alpha}, + }); err != nil { + panic(err) + } +} diff --git a/internal/features/features_test.go b/internal/features/features_test.go new file mode 100644 index 00000000..61687064 --- /dev/null +++ b/internal/features/features_test.go @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package features + +import ( + "testing" +) + +// TestNetworkingIntegration_DefaultEnabled verifies that the NetworkingIntegration +// feature gate defaults to enabled so that existing production deployments are +// unaffected when the flag is not set. +func TestNetworkingIntegration_DefaultEnabled(t *testing.T) { + // Use a fresh gate so this test is independent of any global state mutations. 
+ gate := MutableFeatureGate.DeepCopy() + if !gate.Enabled(NetworkingIntegration) { + t.Error("NetworkingIntegration default = false, want true") + } +} + +// TestNetworkingIntegration_CanBeDisabled verifies that setting +// NetworkingIntegration=false via the feature gate string disables the +// integration, allowing operators to run compute without VPC/NSO. +func TestNetworkingIntegration_CanBeDisabled(t *testing.T) { + gate := MutableFeatureGate.DeepCopy() + if err := gate.Set("NetworkingIntegration=false"); err != nil { + t.Fatalf("Set(NetworkingIntegration=false): %v", err) + } + if gate.Enabled(NetworkingIntegration) { + t.Error("NetworkingIntegration = true after Set=false, want false") + } +} + +// TestNetworkingIntegration_ExplicitlyEnabled verifies that the gate can be +// explicitly set to true (round-trip). +func TestNetworkingIntegration_ExplicitlyEnabled(t *testing.T) { + gate := MutableFeatureGate.DeepCopy() + if err := gate.Set("NetworkingIntegration=true"); err != nil { + t.Fatalf("Set(NetworkingIntegration=true): %v", err) + } + if !gate.Enabled(NetworkingIntegration) { + t.Error("NetworkingIntegration = false after Set=true, want true") + } +} diff --git a/internal/provider/milo/provider.go b/internal/provider/milo/provider.go new file mode 100644 index 00000000..927ec581 --- /dev/null +++ b/internal/provider/milo/provider.go @@ -0,0 +1,352 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +// Package milo provides a multicluster provider that discovers Kubernetes clusters +// by watching Milo Project (and ProjectControlPlane) resources. +// +// This is a local fork of go.miloapis.com/milo/pkg/multicluster-runtime/milo adapted +// to be compatible with multicluster-runtime v0.23+, which changed ClusterName from a +// plain string to a distinct type (multicluster.ClusterName). 
+package milo + +import ( + "context" + "fmt" + "net/url" + "sync" + "time" + + "github.com/go-logr/logr" + infrastructurev1alpha1 "go.miloapis.com/milo/pkg/apis/infrastructure/v1alpha1" + resourcemanagerv1alpha1 "go.miloapis.com/milo/pkg/apis/resourcemanager/v1alpha1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + apimeta "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/rest" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/builder" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/cluster" + "sigs.k8s.io/controller-runtime/pkg/controller" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/manager" + "sigs.k8s.io/controller-runtime/pkg/predicate" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager" + "sigs.k8s.io/multicluster-runtime/pkg/multicluster" +) + +// Built following the cluster-api provider as an example. +// See: https://sigs.k8s.io/multicluster-runtime/blob/7abad14c6d65fdaf9b83a2b1d9a2c99140d18e7d/providers/cluster-api/provider.go + +var _ multicluster.Provider = &Provider{} + +var projectGVK = resourcemanagerv1alpha1.GroupVersion.WithKind("Project") +var projectControlPlaneGVK = infrastructurev1alpha1.GroupVersion.WithKind("ProjectControlPlane") + +// Options are the options for the Datum cluster Provider. +type Options struct { + // ClusterOptions are the options passed to the cluster constructor. + ClusterOptions []cluster.Option + + // InternalServiceDiscovery will result in the provider to look for + // ProjectControlPlane resources in the local manager's cluster, and establish + // a connection via the internal service address. 
Otherwise, the provider will + // look for Project resources in the cluster and expect to connect to the + // external Datum API endpoint. + InternalServiceDiscovery bool + + // ProjectRestConfig is the rest config to use when connecting to project + // API endpoints. If not provided, the provider will use the rest config + // from the local manager. + ProjectRestConfig *rest.Config + + // LabelSelector is an optional selector to filter projects based on labels. + // When provided, only projects matching this selector will be reconciled. + LabelSelector *metav1.LabelSelector +} + +// New creates a new Datum cluster Provider. +func New(localMgr manager.Manager, opts Options) (*Provider, error) { + p := &Provider{ + opts: opts, + log: log.Log.WithName("datum-cluster-provider"), + client: localMgr.GetClient(), + projectRestConfig: opts.ProjectRestConfig, + projects: map[string]cluster.Cluster{}, + cancelFns: map[string]context.CancelFunc{}, + } + + if p.projectRestConfig == nil { + p.projectRestConfig = localMgr.GetConfig() + } + + var project unstructured.Unstructured + if p.opts.InternalServiceDiscovery { + project.SetGroupVersionKind(projectControlPlaneGVK) + } else { + project.SetGroupVersionKind(projectGVK) + } + + var forOpts []builder.ForOption + if opts.LabelSelector != nil { + selector, err := metav1.LabelSelectorAsSelector(opts.LabelSelector) + if err != nil { + return nil, fmt.Errorf("failed to create selector from label selector: %w", err) + } + + labelPredicate := predicate.NewPredicateFuncs(func(obj client.Object) bool { + return selector.Matches(labels.Set(obj.GetLabels())) + }) + + forOpts = append(forOpts, builder.WithPredicates(labelPredicate)) + } + + controllerBuilder := builder.ControllerManagedBy(localMgr). + For(&project, forOpts...). + WithOptions(controller.Options{MaxConcurrentReconciles: 1}). 
+		Named("projectcontrolplane")
+
+	if err := controllerBuilder.Complete(p); err != nil {
+		return nil, fmt.Errorf("failed to create controller: %w", err)
+	}
+
+	return p, nil
+}
+
+type index struct {
+	object       client.Object
+	field        string
+	extractValue client.IndexerFunc
+}
+
+// Provider is a cluster Provider that works with Datum
+type Provider struct {
+	opts              Options
+	log               logr.Logger
+	projectRestConfig *rest.Config
+	client            client.Client
+
+	lock      sync.Mutex
+	mcMgr     mcmanager.Manager
+	projects  map[string]cluster.Cluster
+	cancelFns map[string]context.CancelFunc
+	indexers  []index
+}
+
+// Get returns the cluster with the given name, if it is known.
+func (p *Provider) Get(_ context.Context, clusterName multicluster.ClusterName) (cluster.Cluster, error) {
+	p.lock.Lock()
+	defer p.lock.Unlock()
+	if cl, ok := p.projects[clusterName.String()]; ok {
+		return cl, nil
+	}
+
+	return nil, fmt.Errorf("cluster %s not found", clusterName)
+}
+
+// Run starts the provider and blocks.
+func (p *Provider) Run(ctx context.Context, mgr mcmanager.Manager) error {
+	p.log.Info("Starting Datum cluster provider")
+
+	p.lock.Lock()
+	p.mcMgr = mgr
+	p.lock.Unlock()
+
+	<-ctx.Done()
+
+	return ctx.Err()
+}
+
+// Reconcile registers or unregisters the cluster for the Project (or the
+// ProjectControlPlane, when InternalServiceDiscovery is set) named by req.
+//
+//   - If the object no longer exists, the cluster is removed from the provider
+//     and the context its cache runs under is cancelled.
+//   - If the multicluster manager has not been handed over by Run yet, the
+//     request is requeued after two seconds.
+//   - If the project is already engaged, or does not yet report its readiness
+//     condition as true, nothing is done.
+//   - Otherwise a cluster.Cluster is created against the project's API
+//     endpoint, the saved field indexers are applied, its cache is started and
+//     synced, and it is engaged with the multicluster manager.
+//
+// Every failure after the cluster has been started cancels the cluster context
+// so that a retried reconcile does not leave a previous cache running.
+// p.lock is held for the whole registration path, including the cache sync.
+func (p *Provider) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
+	log := p.log.WithValues("project", req.Name)
+	log.Info("Reconciling Project")
+
+	// Use just the project name as the key for cluster lookup.
+	// This matches the project name used in URL paths and ParentNameExtraKey.
+	key := req.Name
+	var project unstructured.Unstructured
+
+	if p.opts.InternalServiceDiscovery {
+		project.SetGroupVersionKind(projectControlPlaneGVK)
+	} else {
+		project.SetGroupVersionKind(projectGVK)
+	}
+
+	if err := p.client.Get(ctx, req.NamespacedName, &project); err != nil {
+		if apierrors.IsNotFound(err) {
+			log.Info("Project not found, removing cluster if registered", "key", key)
+			p.lock.Lock()
+			defer p.lock.Unlock()
+
+			if _, wasRegistered := p.projects[key]; wasRegistered {
+				log.Info("Removing previously registered cluster for project", "key", key)
+			}
+			delete(p.projects, key)
+			if cancel, ok := p.cancelFns[key]; ok {
+				cancel()
+				// Drop the entry as well so the map does not keep a cancel
+				// function for every project that has ever been deleted.
+				delete(p.cancelFns, key)
+			}
+
+			return ctrl.Result{}, nil
+		}
+
+		log.Error(err, "Failed to get project, will retry", "key", key)
+		return ctrl.Result{}, fmt.Errorf("failed to get project: %w", err)
+	}
+
+	log.V(1).Info("Successfully fetched project", "name", project.GetName(), "namespace", project.GetNamespace())
+
+	p.lock.Lock()
+	defer p.lock.Unlock()
+
+	// Make sure the manager has started
+	// TODO(jreese) what condition would lead to this?
+	if p.mcMgr == nil {
+		log.Info("Multicluster manager not yet started, requeueing", "key", key)
+		return ctrl.Result{RequeueAfter: time.Second * 2}, nil
+	}
+
+	// already engaged?
+	if _, ok := p.projects[key]; ok {
+		log.V(1).Info("Project already engaged, skipping", "key", key)
+		return ctrl.Result{}, nil
+	}
+
+	log.Info("Project not yet engaged, checking readiness", "key", key)
+
+	// ready and provisioned?
+	conditions, err := extractUnstructuredConditions(project.Object)
+	if err != nil {
+		log.Error(err, "Failed to extract conditions from project", "key", key)
+		return ctrl.Result{}, err
+	}
+
+	log.V(1).Info("Checking project readiness conditions", "key", key, "conditionCount", len(conditions))
+
+	if p.opts.InternalServiceDiscovery {
+		if !apimeta.IsStatusConditionTrue(conditions, "ControlPlaneReady") {
+			log.Info("ProjectControlPlane is not ready, skipping registration", "key", key, "conditions", conditions)
+			return ctrl.Result{}, nil
+		}
+	} else {
+		if !apimeta.IsStatusConditionTrue(conditions, "Ready") {
+			log.Info("Project is not ready, skipping registration", "key", key, "conditions", conditions)
+			return ctrl.Result{}, nil
+		}
+	}
+
+	log.Info("Project is ready, proceeding with cluster registration", "key", key)
+
+	// Work on a copy so the shared rest config is never mutated.
+	cfg := rest.CopyConfig(p.projectRestConfig)
+	apiHost, err := url.Parse(cfg.Host)
+	if err != nil {
+		log.Error(err, "Failed to parse API host from rest config", "key", key, "host", cfg.Host)
+		return ctrl.Result{}, fmt.Errorf("failed to parse host from rest config: %w", err)
+	}
+
+	if p.opts.InternalServiceDiscovery {
+		apiHost.Path = ""
+		apiHost.Host = fmt.Sprintf("milo-apiserver.project-%s.svc.cluster.local:6443", project.GetName())
+	} else {
+		apiHost.Path = fmt.Sprintf("/apis/resourcemanager.miloapis.com/v1alpha1/projects/%s/control-plane", project.GetName())
+	}
+	cfg.Host = apiHost.String()
+
+	log.Info("Creating cluster connection", "key", key, "endpoint", cfg.Host)
+
+	// create cluster.
+	cl, err := cluster.New(cfg, p.opts.ClusterOptions...)
+	if err != nil {
+		log.Error(err, "Failed to create cluster object", "key", key, "endpoint", cfg.Host)
+		return ctrl.Result{}, fmt.Errorf("failed to create cluster: %w", err)
+	}
+	for _, idx := range p.indexers {
+		if err := cl.GetCache().IndexField(ctx, idx.object, idx.field, idx.extractValue); err != nil {
+			log.Error(err, "Failed to setup cache index field", "key", key, "field", idx.field)
+			return ctrl.Result{}, fmt.Errorf("failed to index field %q: %w", idx.field, err)
+		}
+	}
+
+	log.Info("Starting cluster cache", "key", key)
+
+	// The cluster runs under its own cancellable context so it can be stopped
+	// independently when the project is deleted or registration fails.
+	clusterCtx, cancel := context.WithCancel(ctx)
+	go func() {
+		if err := cl.Start(clusterCtx); err != nil {
+			log.Error(err, "Cluster cache start failed", "key", key)
+			return
+		}
+	}()
+
+	log.Info("Waiting for cluster cache to sync", "key", key)
+
+	if !cl.GetCache().WaitForCacheSync(ctx) {
+		cancel()
+		log.Error(nil, "Cluster cache sync failed", "key", key)
+		return ctrl.Result{}, fmt.Errorf("failed to sync cache")
+	}
+
+	log.Info("Cluster cache synced successfully", "key", key)
+
+	// store project client
+	p.projects[key] = cl
+	p.cancelFns[key] = cancel
+
+	log.Info("Engaging cluster with multicluster manager", "key", key)
+
+	// engage manager.
+	if err := p.mcMgr.Engage(clusterCtx, multicluster.ClusterName(key), cl); err != nil {
+		log.Error(err, "Failed to engage cluster with multicluster manager", "key", key)
+		// Stop the cache goroutine started above; without this every failed
+		// (and retried) engagement would leave another cluster running.
+		cancel()
+		delete(p.projects, key)
+		delete(p.cancelFns, key)
+		return reconcile.Result{}, err
+	}
+
+	log.Info("Successfully registered and engaged new cluster", "key", key, "endpoint", cfg.Host)
+
+	return ctrl.Result{}, nil
+}
+
+func (p *Provider) IndexField(ctx context.Context, obj client.Object, field string, extractValue client.IndexerFunc) error {
+	p.lock.Lock()
+	defer p.lock.Unlock()
+
+	// save for future projects.
+	p.indexers = append(p.indexers, index{
+		object:       obj,
+		field:        field,
+		extractValue: extractValue,
+	})
+
+	// apply to existing projects.
+ for name, cl := range p.projects { + if err := cl.GetCache().IndexField(ctx, obj, field, extractValue); err != nil { + return fmt.Errorf("failed to index field %q on project %q: %w", field, name, err) + } + } + return nil +} + +func extractUnstructuredConditions( + obj map[string]interface{}, +) ([]metav1.Condition, error) { + conditions, ok, _ := unstructured.NestedSlice(obj, "status", "conditions") + if !ok { + return nil, nil + } + + wrappedConditions := map[string]interface{}{ + "conditions": conditions, + } + + var typedConditions struct { + Conditions []metav1.Condition `json:"conditions"` + } + + if err := runtime.DefaultUnstructuredConverter.FromUnstructured(wrappedConditions, &typedConditions); err != nil { + return nil, fmt.Errorf("failed converting unstructured conditions: %w", err) + } + + return typedConditions.Conditions, nil +} diff --git a/internal/quota/client.go b/internal/quota/client.go new file mode 100644 index 00000000..acef469c --- /dev/null +++ b/internal/quota/client.go @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package quota + +import ( + "context" + "fmt" + "net/url" + "sync" + + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/rest" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +// ProjectQuotaClientManager builds and caches controller-runtime clients that +// target individual Milo project control planes. It is safe for concurrent use. +type ProjectQuotaClientManager struct { + baseRestConfig *rest.Config + clients sync.Map // key: projectID (string), value: client.Client +} + +// New returns a ProjectQuotaClientManager that derives per-project REST configs +// from baseRestConfig by rewriting the host path. +func New(baseRestConfig *rest.Config) *ProjectQuotaClientManager { + return &ProjectQuotaClientManager{baseRestConfig: baseRestConfig} +} + +// StoreClient pre-populates the cache with a pre-built client for projectID. 
+// This is intended for use in unit tests where a real REST server is unavailable.
+func (m *ProjectQuotaClientManager) StoreClient(projectID string, cl client.Client) {
+	m.clients.Store(projectID, cl)
+}
+
+// ClientForProject returns a client.Client targeting the Milo project control
+// plane for projectID. Clients are cached per projectID, and scheme is only used
+// when the client is first built, so it must include all types any caller
+// intends to operate on, including quotav1alpha1.
+func (m *ProjectQuotaClientManager) ClientForProject(
+	ctx context.Context,
+	projectID string,
+	scheme *runtime.Scheme,
+) (client.Client, error) {
+	if v, ok := m.clients.Load(projectID); ok {
+		return v.(client.Client), nil
+	}
+
+	cfg := rest.CopyConfig(m.baseRestConfig)
+	apiHost, err := url.Parse(cfg.Host)
+	if err != nil {
+		return nil, fmt.Errorf("failed to parse base host: %w", err)
+	}
+	apiHost.Path = fmt.Sprintf(
+		"/apis/resourcemanager.miloapis.com/v1alpha1/projects/%s/control-plane",
+		projectID,
+	)
+	cfg.Host = apiHost.String()
+
+	cl, err := client.New(cfg, client.Options{Scheme: scheme})
+	if err != nil {
+		return nil, fmt.Errorf("failed to create client for project %q: %w", projectID, err)
+	}
+	// LoadOrStore keeps whichever client was stored first when callers race on a cold cache.
+	cached, _ := m.clients.LoadOrStore(projectID, cl)
+	return cached.(client.Client), nil
+}
diff --git a/internal/quota/metrics.go b/internal/quota/metrics.go
new file mode 100644
index 00000000..5f1788cd
--- /dev/null
+++ b/internal/quota/metrics.go
@@ -0,0 +1,51 @@
+// SPDX-License-Identifier: AGPL-3.0-only
+
+package quota
+
+import (
+	"github.com/prometheus/client_golang/prometheus"
+	ctrlmetrics "sigs.k8s.io/controller-runtime/pkg/metrics"
+)
+
+// Metric reason label values for quota_eval_failures_total.
+const ( + ReasonBackendUnavailable = "backend_unavailable" + ReasonProjectNotFound = "project_not_found" + ReasonNamespaceNotFound = "namespace_not_found" + ReasonMisconfigured = "misconfigured" + ReasonProjectIDUnresolvable = "project_id_unresolvable" + ReasonNoBudget = "no_budget" +) + +var ( + // EnforcementEnabled is a gauge set to 1 when quota enforcement is active + // (a credential path is configured) and 0 when disabled (no path configured). + // This gives dashboards and alerting a stable signal rather than log scraping. + EnforcementEnabled = prometheus.NewGauge(prometheus.GaugeOpts{ + Name: "compute_quota_enforcement_enabled", + Help: "1 if quota enforcement is active, 0 if disabled (no credential configured).", + }) + + // EvalFailuresTotal counts quota evaluation failures by reason code. + // Incremented each time quota evaluation fails for a reason other than the + // normal quota-exceeded or quota-pending flow. + EvalFailuresTotal = prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "compute_quota_eval_failures_total", + Help: "Total quota evaluation failures by reason code.", + }, []string{"reason"}) + + // ClaimOrphanedTotal counts ResourceClaims orphaned during instance deletion + // because the project ID could not be resolved at deletion time. 
+ ClaimOrphanedTotal = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "compute_quota_claim_orphaned_total", + Help: "Total ResourceClaims orphaned because project ID could not be resolved at deletion.", + }) +) + +func init() { + ctrlmetrics.Registry.MustRegister( + EnforcementEnabled, + EvalFailuresTotal, + ClaimOrphanedTotal, + ) +} diff --git a/internal/validation/instance_validation.go b/internal/validation/instance_validation.go index 7f112822..b8a068f4 100644 --- a/internal/validation/instance_validation.go +++ b/internal/validation/instance_validation.go @@ -17,12 +17,25 @@ import ( networkingv1alpha "go.datum.net/network-services-operator/api/v1alpha" ) +// Validation constants for well-known string literals used across multiple +// validation functions. +const ( + // diskTypePDStandard is the only currently supported disk type. + diskTypePDStandard = "pd-standard" + + // defaultImageName is the only currently supported container image. + defaultImageName = "datumcloud/ubuntu-2204-lts" + + // defaultInstanceType is the only currently supported instance type. + defaultInstanceType = "datumcloud/d1-standard-2" +) + func validateInstanceTemplate( template computev1alpha.InstanceTemplateSpec, fieldPath *field.Path, opts WorkloadValidationOptions, ) field.ErrorList { - allErrs := field.ErrorList{} + allErrs := make(field.ErrorList, 0, 2) allErrs = append(allErrs, validateInstanceTemplateMetadata(template, fieldPath)...) allErrs = append(allErrs, validateInstanceSpec(template.Spec, fieldPath.Child("spec"), opts)...) @@ -66,7 +79,7 @@ func validateInstanceSpec( fieldPath *field.Path, opts WorkloadValidationOptions, ) field.ErrorList { - allErrs := field.ErrorList{} + allErrs := make(field.ErrorList, 0, 3) volumes, volumeErrs := validateVolumes(spec, fieldPath) allErrs = append(allErrs, volumeErrs...) 
@@ -97,6 +110,11 @@ func validateInstanceNetworkInterfaces( allErrs = append(allErrs, field.Invalid(networkNameField, networkInterface.Network, msg)) } + extra := make(map[string]authorizationv1.ExtraValue, len(opts.AdmissionRequest.UserInfo.Extra)) + for k, v := range opts.AdmissionRequest.UserInfo.Extra { + extra[k] = authorizationv1.ExtraValue(v) + } + review := authorizationv1.SubjectAccessReview{ Spec: authorizationv1.SubjectAccessReviewSpec{ ResourceAttributes: &authorizationv1.ResourceAttributes{ @@ -110,6 +128,7 @@ func validateInstanceNetworkInterfaces( User: opts.AdmissionRequest.UserInfo.Username, Groups: opts.AdmissionRequest.UserInfo.Groups, UID: opts.AdmissionRequest.UserInfo.UID, + Extra: extra, }, } @@ -258,8 +277,8 @@ func validateDiskVolumeSource(diskSource *computev1alpha.DiskTemplateVolumeSourc diskTemplateSpecField := diskTemplateField.Child("spec") // TODO(jrese) look up valid disk types - if diskTemplate.Spec.Type != "pd-standard" { - allErrs = append(allErrs, field.NotSupported(diskTemplateSpecField.Child("type"), diskTemplate.Spec.Type, []string{"pd-standard"})) + if diskTemplate.Spec.Type != diskTypePDStandard { + allErrs = append(allErrs, field.NotSupported(diskTemplateSpecField.Child("type"), diskTemplate.Spec.Type, []string{diskTypePDStandard})) } populatorResourceRequests, errs := validateDiskPopulator(diskTemplate.Spec.Populator, diskTemplateField.Child("populator")) @@ -400,8 +419,8 @@ func validateDiskPopulator(populator *computev1alpha.DiskPopulator, fieldPath *f // TODO(jreese) look up image imagePopulator := populator.Image - if imagePopulator.Name != "datumcloud/ubuntu-2204-lts" { - allErrs = append(allErrs, field.NotSupported(imageField.Child("name"), imagePopulator.Name, []string{"datumcloud/ubuntu-2204-lts"})) + if imagePopulator.Name != defaultImageName { + allErrs = append(allErrs, field.NotSupported(imageField.Child("name"), imagePopulator.Name, []string{defaultImageName})) } } } @@ -457,7 +476,7 @@ func 
validateInstanceRuntimeSpec(spec computev1alpha.InstanceRuntimeSpec, volume } func validateSandboxRuntime(sandbox *computev1alpha.SandboxRuntime, volumes map[string]computev1alpha.VolumeSource, fieldPath *field.Path) field.ErrorList { - allErrs := field.ErrorList{} + allErrs := make(field.ErrorList, 0, 4) allErrs = append(allErrs, validateSandboxContainers(sandbox.Containers, volumes, fieldPath.Child("containers"))...) allErrs = append(allErrs, validateImagePullSecrets(sandbox.ImagePullSecrets, fieldPath.Child("imagePullSecrets"))...) @@ -572,7 +591,7 @@ func validateVolumeAttachments( volumes map[string]computev1alpha.VolumeSource, fieldPath *field.Path, ) field.ErrorList { - allErrs := field.ErrorList{} + allErrs := make(field.ErrorList, 0, len(attachments)) allMounthPaths := sets.Set[string]{} @@ -657,8 +676,8 @@ func validateInstanceRuntimeResources(resources computev1alpha.InstanceRuntimeRe allErrs := field.ErrorList{} // TODO(jreese) look up available instance types - if resources.InstanceType != "datumcloud/d1-standard-2" { - allErrs = append(allErrs, field.NotSupported(fieldPath, resources.InstanceType, []string{"datumcloud/d1-standard-2"})) + if resources.InstanceType != defaultInstanceType { + allErrs = append(allErrs, field.NotSupported(fieldPath, resources.InstanceType, []string{defaultInstanceType})) } if resources.Requests != nil { diff --git a/internal/validation/workload_validation.go b/internal/validation/workload_validation.go index 5f320e9a..c18fcbcb 100644 --- a/internal/validation/workload_validation.go +++ b/internal/validation/workload_validation.go @@ -18,7 +18,7 @@ import ( // https://github.com/kubernetes/kubernetes/blob/master/pkg/apis/core/validation/validation.go func ValidateWorkloadCreate(w *computev1alpha.Workload, opts WorkloadValidationOptions) field.ErrorList { - allErrs := field.ErrorList{} + allErrs := make(field.ErrorList, 0, 4) // allErrs = append(allErrs, validateWorkloadMetadata(w)...) 
allErrs = append(allErrs, validateWorkloadSpec(w.Spec, opts)...) @@ -35,7 +35,7 @@ type WorkloadValidationOptions struct { } func validateWorkloadSpec(spec computev1alpha.WorkloadSpec, opts WorkloadValidationOptions) field.ErrorList { - allErrs := field.ErrorList{} + allErrs := make(field.ErrorList, 0, 4) specPath := field.NewPath("spec") @@ -111,7 +111,7 @@ func validateScaleSettings(placement computev1alpha.HorizontalScaleSettings, fie } func validateScaleSettingMetrics(metrics []computev1alpha.MetricSpec, fieldPath *field.Path) field.ErrorList { - allErrs := field.ErrorList{} + allErrs := make(field.ErrorList, 0, len(metrics)) for i, m := range metrics { metricField := fieldPath.Index(i) diff --git a/internal/validation/workload_validation_test.go b/internal/validation/workload_validation_test.go index f73e4c9f..7b785683 100644 --- a/internal/validation/workload_validation_test.go +++ b/internal/validation/workload_validation_test.go @@ -23,6 +23,15 @@ import ( networkingv1alpha "go.datum.net/network-services-operator/api/v1alpha" ) +// Test constants for repeated string literals. 
+const ( + testCPUResource = "cpu" + testVolName = "vol" + testDuplicateMountPath = "duplicate-mount-path" + testDefaultNamespace = "default" + testCityCodeDFW = "DFW" +) + func TestValidateWorkloads(t *testing.T) { scenarios := map[string]struct { workload *computev1alpha.Workload @@ -157,7 +166,7 @@ func TestValidateWorkloads(t *testing.T) { w.Spec.Placements[0].ScaleSettings.Metrics = []computev1alpha.MetricSpec{ { Resource: &computev1alpha.ResourceMetricSource{ - Name: "cpu", + Name: testCPUResource, Target: computev1alpha.MetricTarget{ Value: resource.NewQuantity(50, resource.DecimalSI), AverageValue: resource.NewQuantity(50, resource.DecimalSI), @@ -181,7 +190,7 @@ func TestValidateWorkloads(t *testing.T) { w.Spec.Placements[0].ScaleSettings.Metrics = []computev1alpha.MetricSpec{ { Resource: &computev1alpha.ResourceMetricSource{ - Name: "cpu", + Name: testCPUResource, Target: computev1alpha.MetricTarget{ Value: resource.NewQuantity(-1, resource.DecimalSI), }, @@ -202,7 +211,7 @@ func TestValidateWorkloads(t *testing.T) { w.Spec.Placements[0].ScaleSettings.Metrics = []computev1alpha.MetricSpec{ { Resource: &computev1alpha.ResourceMetricSource{ - Name: "cpu", + Name: testCPUResource, Target: computev1alpha.MetricTarget{ AverageValue: resource.NewQuantity(-1, resource.DecimalSI), }, @@ -223,7 +232,7 @@ func TestValidateWorkloads(t *testing.T) { w.Spec.Placements[0].ScaleSettings.Metrics = []computev1alpha.MetricSpec{ { Resource: &computev1alpha.ResourceMetricSource{ - Name: "cpu", + Name: testCPUResource, Target: computev1alpha.MetricTarget{ AverageUtilization: proto.Int32(0), }, @@ -336,16 +345,16 @@ func TestValidateWorkloads(t *testing.T) { w.Spec.Template.Spec.Runtime.VirtualMachine.VolumeAttachments = append( w.Spec.Template.Spec.Runtime.VirtualMachine.VolumeAttachments, computev1alpha.VolumeAttachment{ - Name: "vol", + Name: testVolName, }, ) w.Spec.Template.Spec.Volumes = append(w.Spec.Template.Spec.Volumes, computev1alpha.InstanceVolume{ - Name: "vol", + 
Name: testVolName, VolumeSource: computev1alpha.VolumeSource{ Disk: &computev1alpha.DiskTemplateVolumeSource{ Template: &computev1alpha.DiskTemplateVolumeSourceTemplate{ Spec: computev1alpha.DiskSpec{ - Type: "pd-standard", + Type: diskTypePDStandard, Resources: &computev1alpha.DiskResourceRequirements{ Requests: k8scorev1.ResourceList{ k8scorev1.ResourceStorage: resource.MustParse("1Gi"), @@ -369,16 +378,16 @@ func TestValidateWorkloads(t *testing.T) { w.Spec.Template.Spec.Runtime.VirtualMachine.VolumeAttachments = append( w.Spec.Template.Spec.Runtime.VirtualMachine.VolumeAttachments, computev1alpha.VolumeAttachment{ - Name: "vol", + Name: testVolName, }, ) w.Spec.Template.Spec.Volumes = append(w.Spec.Template.Spec.Volumes, computev1alpha.InstanceVolume{ - Name: "vol", + Name: testVolName, VolumeSource: computev1alpha.VolumeSource{ Disk: &computev1alpha.DiskTemplateVolumeSource{ Template: &computev1alpha.DiskTemplateVolumeSourceTemplate{ Spec: computev1alpha.DiskSpec{ - Type: "pd-standard", + Type: diskTypePDStandard, Resources: &computev1alpha.DiskResourceRequirements{ Requests: k8scorev1.ResourceList{ k8scorev1.ResourceStorage: resource.MustParse("1Pi"), @@ -402,16 +411,16 @@ func TestValidateWorkloads(t *testing.T) { w.Spec.Template.Spec.Runtime.VirtualMachine.VolumeAttachments = append( w.Spec.Template.Spec.Runtime.VirtualMachine.VolumeAttachments, computev1alpha.VolumeAttachment{ - Name: "vol", + Name: testVolName, }, ) w.Spec.Template.Spec.Volumes = append(w.Spec.Template.Spec.Volumes, computev1alpha.InstanceVolume{ - Name: "vol", + Name: testVolName, VolumeSource: computev1alpha.VolumeSource{ Disk: &computev1alpha.DiskTemplateVolumeSource{ Template: &computev1alpha.DiskTemplateVolumeSourceTemplate{ Spec: computev1alpha.DiskSpec{ - Type: "pd-standard", + Type: diskTypePDStandard, Resources: &computev1alpha.DiskResourceRequirements{ Requests: k8scorev1.ResourceList{ k8scorev1.ResourceStorage: resource.MustParse("10.5Gi"), @@ -436,7 +445,7 @@ func 
TestValidateWorkloads(t *testing.T) { Disk: &computev1alpha.DiskTemplateVolumeSource{ Template: &computev1alpha.DiskTemplateVolumeSourceTemplate{ Spec: computev1alpha.DiskSpec{ - Type: "pd-standard", + Type: diskTypePDStandard, Resources: &computev1alpha.DiskResourceRequirements{ Requests: k8scorev1.ResourceList{ k8scorev1.ResourceStorage: resource.MustParse("10Gi"), @@ -473,7 +482,7 @@ func TestValidateWorkloads(t *testing.T) { Disk: &computev1alpha.DiskTemplateVolumeSource{ Template: &computev1alpha.DiskTemplateVolumeSourceTemplate{ Spec: computev1alpha.DiskSpec{ - Type: "pd-standard", + Type: diskTypePDStandard, Resources: &computev1alpha.DiskResourceRequirements{ Requests: k8scorev1.ResourceList{ k8scorev1.ResourceStorage: resource.MustParse("10Gi"), @@ -490,11 +499,11 @@ func TestValidateWorkloads(t *testing.T) { } w.Spec.Template.Spec.Runtime.Sandbox.Containers[0].VolumeAttachments = []computev1alpha.VolumeAttachment{ { - Name: "duplicate-mount-path", + Name: testDuplicateMountPath, MountPath: proto.String("/mount1"), }, { - Name: "duplicate-mount-path", + Name: testDuplicateMountPath, MountPath: proto.String("/mount1"), }, { @@ -503,7 +512,7 @@ func TestValidateWorkloads(t *testing.T) { } w.Spec.Template.Spec.Volumes = []computev1alpha.InstanceVolume{ { - Name: "duplicate-mount-path", + Name: testDuplicateMountPath, VolumeSource: volumeSource, }, } @@ -540,7 +549,7 @@ func TestValidateWorkloads(t *testing.T) { interceptorFuncs: &interceptor.Funcs{ Create: func(ctx context.Context, client client.WithWatch, obj client.Object, opts ...client.CreateOption) error { if sar, ok := obj.(*authorizationv1.SubjectAccessReview); ok { - if sar.Spec.ResourceAttributes.Name == "default" && + if sar.Spec.ResourceAttributes.Name == testDefaultNamespace && sar.Spec.ResourceAttributes.Group == networkingv1alpha.GroupVersion.Group && sar.Spec.ResourceAttributes.Version == networkingv1alpha.GroupVersion.Version && sar.Spec.ResourceAttributes.Resource == "networks" { @@ -559,8 
+568,8 @@ func TestValidateWorkloads(t *testing.T) { initObjs := []client.Object{ &networkingv1alpha.Network{ ObjectMeta: metav1.ObjectMeta{ - Namespace: "default", - Name: "default", + Namespace: testDefaultNamespace, + Name: testDefaultNamespace, }, }, } @@ -606,7 +615,7 @@ func TestValidateWorkloads(t *testing.T) { ) if len(scenario.opts.ValidCityCodes) == 0 { - scenario.opts.ValidCityCodes = []string{"DFW"} + scenario.opts.ValidCityCodes = []string{testCityCodeDFW} } t.Run(name, func(t *testing.T) { @@ -639,13 +648,13 @@ func MakeSandboxWorkload(name string, tweaks ...Tweak) *computev1alpha.Workload NetworkInterfaces: []computev1alpha.InstanceNetworkInterface{ { Network: networkingv1alpha.NetworkRef{ - Name: "default", + Name: testDefaultNamespace, }, }, }, Runtime: computev1alpha.InstanceRuntimeSpec{ Resources: computev1alpha.InstanceRuntimeResources{ - InstanceType: "datumcloud/d1-standard-2", + InstanceType: defaultInstanceType, }, Sandbox: &computev1alpha.SandboxRuntime{ Containers: []computev1alpha.SandboxContainer{ @@ -661,7 +670,7 @@ func MakeSandboxWorkload(name string, tweaks ...Tweak) *computev1alpha.Workload Placements: []computev1alpha.WorkloadPlacement{ { Name: "placement1", - CityCodes: []string{"DFW"}, + CityCodes: []string{testCityCodeDFW}, ScaleSettings: computev1alpha.HorizontalScaleSettings{ MinReplicas: 1, }, @@ -696,13 +705,13 @@ func MakeVMWorkload(name string, tweaks ...Tweak) *computev1alpha.Workload { NetworkInterfaces: []computev1alpha.InstanceNetworkInterface{ { Network: networkingv1alpha.NetworkRef{ - Name: "default", + Name: testDefaultNamespace, }, }, }, Runtime: computev1alpha.InstanceRuntimeSpec{ Resources: computev1alpha.InstanceRuntimeResources{ - InstanceType: "datumcloud/d1-standard-2", + InstanceType: defaultInstanceType, }, VirtualMachine: &computev1alpha.VirtualMachineRuntime{ VolumeAttachments: []computev1alpha.VolumeAttachment{ @@ -719,10 +728,10 @@ func MakeVMWorkload(name string, tweaks ...Tweak) 
*computev1alpha.Workload { Disk: &computev1alpha.DiskTemplateVolumeSource{ Template: &computev1alpha.DiskTemplateVolumeSourceTemplate{ Spec: computev1alpha.DiskSpec{ - Type: "pd-standard", + Type: diskTypePDStandard, Populator: &computev1alpha.DiskPopulator{ Image: &computev1alpha.ImageDiskPopulator{ - Name: "datumcloud/ubuntu-2204-lts", + Name: defaultImageName, }, }, }, @@ -736,7 +745,7 @@ func MakeVMWorkload(name string, tweaks ...Tweak) *computev1alpha.Workload { Placements: []computev1alpha.WorkloadPlacement{ { Name: "placement1", - CityCodes: []string{"DFW"}, + CityCodes: []string{testCityCodeDFW}, ScaleSettings: computev1alpha.HorizontalScaleSettings{ MinReplicas: 1, }, diff --git a/internal/webhook/v1alpha/workload_webhook.go b/internal/webhook/v1alpha/workload_webhook.go index e3f3735c..199508fb 100644 --- a/internal/webhook/v1alpha/workload_webhook.go +++ b/internal/webhook/v1alpha/workload_webhook.go @@ -1,17 +1,19 @@ +// SPDX-License-Identifier: AGPL-3.0-only + package webhook import ( "context" "fmt" "k8s.io/apimachinery/pkg/api/errors" - "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/util/sets" ctrl "sigs.k8s.io/controller-runtime" logf "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/webhook/admission" mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager" + "sigs.k8s.io/multicluster-runtime/pkg/multicluster" computev1alpha "go.datum.net/compute/api/v1alpha" "go.datum.net/compute/internal/validation" @@ -27,8 +29,7 @@ func SetupWorkloadWebhookWithManager(mgr mcmanager.Manager) error { mgr: mgr, } - return ctrl.NewWebhookManagedBy(mgr.GetLocalManager()). - For(&computev1alpha.Workload{}). + return ctrl.NewWebhookManagedBy(mgr.GetLocalManager(), &computev1alpha.Workload{}). WithDefaulter(webhook). WithValidator(webhook).
Complete() @@ -40,17 +41,11 @@ type workloadWebhook struct { mgr mcmanager.Manager } -var _ admission.CustomDefaulter = &workloadWebhook{} -var _ admission.CustomValidator = &workloadWebhook{} - -// Default implements webhook.Defaulter so a webhook will be registered for the type -func (r *workloadWebhook) Default(ctx context.Context, obj runtime.Object) error { - workload, ok := obj.(*computev1alpha.Workload) - if !ok { - return fmt.Errorf("unexpected type %T", obj) - } - _ = workload +var _ admission.Defaulter[*computev1alpha.Workload] = &workloadWebhook{} +var _ admission.Validator[*computev1alpha.Workload] = &workloadWebhook{} +// Default implements admission.Defaulter so a mutating webhook will be registered for the type. +func (r *workloadWebhook) Default(_ context.Context, _ *computev1alpha.Workload) error { // // TODO(jreese) review and test gateway defaulting / logic // if gw := workload.Spec.Gateway; gw != nil { // for i, tcpRoute := range gw.TCPRoutes { @@ -75,15 +70,10 @@ func (r *workloadWebhook) Default(ctx context.Context, obj runtime.Object) error // +kubebuilder:webhook:path=/validate-compute-datumapis-com-v1alpha-workload,mutating=false,failurePolicy=fail,sideEffects=None,groups=compute.datumapis.com,resources=workloads,verbs=create;update,versions=v1alpha,name=vworkload.kb.io,admissionReviewVersions=v1 -func (r *workloadWebhook) ValidateCreate(ctx context.Context, obj runtime.Object) (admission.Warnings, error) { - workload, ok := obj.(*computev1alpha.Workload) - if !ok { - return nil, fmt.Errorf("unexpected type %T", obj) - } - +func (r *workloadWebhook) ValidateCreate(ctx context.Context, workload *computev1alpha.Workload) (admission.Warnings, error) { clusterName := computewebhook.ClusterNameFromContext(ctx) - cluster, err := r.mgr.GetCluster(ctx, clusterName) + cluster, err := r.mgr.GetCluster(ctx, multicluster.ClusterName(clusterName)) if err != nil { return nil, err } @@ -101,9 +91,9 @@ func (r *workloadWebhook) ValidateCreate(ctx
context.Context, obj runtime.Object // that means for the scheduling phase, since there would not currently be // sufficient context to know who created the workload and what locations // are valid candidates based on that. Maybe an annotation, or spec field? - var locations networkingv1alpha.LocationList + var locations networkingv1alpha.LocationBindingList if err := clusterClient.List(ctx, &locations); err != nil { - return nil, fmt.Errorf("failed to list locations: %w", err) + return nil, fmt.Errorf("failed to list location bindings: %w", err) } validCityCodes := sets.Set[string]{} @@ -123,38 +113,18 @@ func (r *workloadWebhook) ValidateCreate(ctx context.Context, obj runtime.Object } if errs := validation.ValidateWorkloadCreate(workload, opts); len(errs) > 0 { - return nil, errors.NewInvalid(obj.GetObjectKind().GroupVersionKind().GroupKind(), workload.Name, errs) + return nil, errors.NewInvalid(workload.GroupVersionKind().GroupKind(), workload.Name, errs) } return nil, nil } -func (r *workloadWebhook) ValidateUpdate(ctx context.Context, oldObj, newObj runtime.Object) (admission.Warnings, error) { - oldworkload, ok := oldObj.(*computev1alpha.Workload) - if !ok { - return nil, fmt.Errorf("unexpected type %T", oldObj) - } - - _ = oldworkload - - newworkload, ok := newObj.(*computev1alpha.Workload) - if !ok { - return nil, fmt.Errorf("unexpected type %T", newObj) - } - - _ = newworkload - +func (r *workloadWebhook) ValidateUpdate(_ context.Context, _, _ *computev1alpha.Workload) (admission.Warnings, error) { // TODO(user): fill in your validation logic upon object update.
return nil, nil } -func (r *workloadWebhook) ValidateDelete(ctx context.Context, obj runtime.Object) (admission.Warnings, error) { - workload, ok := obj.(*computev1alpha.Workload) - if !ok { - return nil, fmt.Errorf("unexpected type %T", obj) - } - _ = workload - +func (r *workloadWebhook) ValidateDelete(_ context.Context, _ *computev1alpha.Workload) (admission.Warnings, error) { // TODO(user): fill in your validation logic upon object deletion. return nil, nil } diff --git a/test/e2e/chainsaw-config.yaml b/test/e2e/chainsaw-config.yaml new file mode 100644 index 00000000..cd3a9950 --- /dev/null +++ b/test/e2e/chainsaw-config.yaml @@ -0,0 +1,47 @@ +# Chainsaw global configuration for the compute federation e2e test suite. +# +# Prerequisites +# ───────────── +# Run `task e2e:up` to create the Kind clusters and populate kubeconfigs under +# tmp/e2e/kubeconfigs/ before running Chainsaw. +# +# Running +# ─────── +# From the repository root via Taskfile (recommended): +# +# task e2e:test +# +# Or directly: +# +# KUBECONFIG=tmp/e2e/kubeconfigs/control-plane.yaml \ +# chainsaw test --config test/e2e/chainsaw-config.yaml test/e2e/ +# +# The KUBECONFIG env var sets the "default" cluster (control-plane cell). +# Additional clusters (downstream, pop-dfw, pop-ord) are declared below and +# referenced by name in individual test steps via `cluster: downstream` etc. +# +# Kubeconfig paths below are relative to the working directory where Chainsaw is +# invoked (the project root), NOT relative to this config file's location. +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Configuration +metadata: + name: chainsaw +spec: + timeouts: + apply: 30s + assert: 60s + cleanup: 60s + delete: 30s + error: 30s + exec: 30s + clusters: + # Downstream control plane. WorkloadDeployments, PropagationPolicies, + # and Instance write-backs live here. 
+ downstream: + kubeconfig: tmp/e2e/kubeconfigs/downstream.yaml + # POP DFW cell — downstream member cluster labelled topology.datum.net/city-code=dfw. + pop-dfw: + kubeconfig: tmp/e2e/kubeconfigs/pop-dfw.yaml + # POP ORD cell — downstream member cluster labelled topology.datum.net/city-code=ord. + pop-ord: + kubeconfig: tmp/e2e/kubeconfigs/pop-ord.yaml diff --git a/test/e2e/deletion-cascade/assert-downstream-wd-exists.yaml b/test/e2e/deletion-cascade/assert-downstream-wd-exists.yaml new file mode 100644 index 00000000..aae65da1 --- /dev/null +++ b/test/e2e/deletion-cascade/assert-downstream-wd-exists.yaml @@ -0,0 +1,7 @@ +# Assert the WorkloadDeployment is present in the Karmada API server. +# Used both to confirm federation succeeded and as the target for the error: check. +apiVersion: compute.datumapis.com/v1alpha +kind: WorkloadDeployment +metadata: + namespace: ($downstreamNS) + name: test-cascade-wd diff --git a/test/e2e/deletion-cascade/chainsaw-test.yaml b/test/e2e/deletion-cascade/chainsaw-test.yaml new file mode 100644 index 00000000..03a11ea0 --- /dev/null +++ b/test/e2e/deletion-cascade/chainsaw-test.yaml @@ -0,0 +1,79 @@ +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Test +metadata: + name: deletion-cascade +spec: + description: | + Verifies that deleting a WorkloadDeployment from the project namespace causes + the federator to remove the corresponding WorkloadDeployment from Karmada. + + The WorkloadDeploymentFederator adds a finalizer + (compute.datumapis.com/federator) to every project WD it manages. When the + project WD is deleted: + 1. The finalizer's Finalize method runs (blocking deletion until complete). + 2. It deletes the Karmada-side WorkloadDeployment. + 3. It removes the PropagationPolicy if no other WDs for the city remain. + 4. It removes the finalizer, allowing the project WD to be garbage-collected. + + This test validates: project WD deletion → Karmada WD deletion. 
+ + template: true + + steps: + - name: create-wd + description: Create a WorkloadDeployment on the control-plane cluster. + try: + - apply: + file: workload-deployment.yaml + + - name: wait-for-federation + description: Wait for the WorkloadDeployment to appear in Karmada. + cluster: downstream + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: downstreamNS + value: ($stdout) + - assert: + timeout: 30s + resource: + apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + metadata: + namespace: ($downstreamNS) + name: test-cascade-wd + + - name: delete-wd + description: Delete the WorkloadDeployment from the control-plane cluster. + try: + - delete: + ref: + apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + namespace: ($namespace) + name: test-cascade-wd + + - name: assert-downstream-wd-deleted + description: Confirm the Karmada copy is removed by the finalizer. 
+ cluster: downstream + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: downstreamNS + value: ($stdout) + - wait: + apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + namespace: ($downstreamNS) + name: test-cascade-wd + timeout: 30s + for: + deletion: {} diff --git a/test/e2e/deletion-cascade/workload-deployment.yaml b/test/e2e/deletion-cascade/workload-deployment.yaml new file mode 100644 index 00000000..39d68a1d --- /dev/null +++ b/test/e2e/deletion-cascade/workload-deployment.yaml @@ -0,0 +1,21 @@ +apiVersion: compute.datumapis.com/v1alpha +kind: WorkloadDeployment +metadata: + name: test-cascade-wd +spec: + cityCode: dfw + placementName: default + workloadRef: + name: test-workload + uid: "00000000-0000-0000-0000-000000000001" + template: + spec: + runtime: + resources: + instanceType: datumcloud/d1-standard-2 + networkInterfaces: + - network: + name: test-network + + scaleSettings: + minReplicas: 1 diff --git a/test/e2e/env/README.md b/test/e2e/env/README.md new file mode 100644 index 00000000..671e705d --- /dev/null +++ b/test/e2e/env/README.md @@ -0,0 +1,251 @@ +# Local Kind + Karmada e2e Environment + +This document describes the local multi-cluster environment used for end-to-end +testing of the compute federation layer. 
+ +--- + +## Prerequisites + +| Tool | Minimum version | Install | +|------|----------------|---------| +| [Docker Desktop](https://www.docker.com/products/docker-desktop/) | 4.x | required for Kind | +| [kind](https://kind.sigs.k8s.io/) | v0.23+ | `brew install kind` | +| [kubectl](https://kubernetes.io/docs/tasks/tools/) | v1.28+ | `brew install kubernetes-cli` | +| [helm](https://helm.sh/) | v3.14+ | `brew install helm` | +| [task](https://taskfile.dev/) | v3 | `brew install go-task` | +| Python 3 | 3.9+ | pre-installed on macOS | +| go | 1.24+ | `brew install go` | + +`karmadactl` is downloaded automatically by `task e2e:up` into `./bin/`. + +--- + +## Cluster Topology + +``` +┌─────────────────────────────────────────────────────────────────────┐ +│ compute-control-plane (Kind cluster) │ +│ │ +│ ┌───────────────────────────────────────────────────────────────┐ │ +│ │ karmada-system namespace │ │ +│ │ Karmada API Server ←── https://localhost:32443 │ │ +│ │ Karmada Controller Manager │ │ +│ │ Karmada Scheduler │ │ +│ └───────────────────────────────────────────────────────────────┘ │ +│ │ +│ compute operator (WorkloadReconciler, Federator, InstanceProjector)│ +└──────────────────────────┬──────────────────────────────────────────┘ + │ Karmada propagates WorkloadDeployments + ┌────────────────┴─────────────────┐ + │ │ +┌─────────▼──────────┐ ┌──────────▼─────────┐ +│ compute-pop-dfw │ │ compute-pop-ord │ +│ (Kind cluster) │ │ (Kind cluster) │ +│ │ │ │ +│ city-code=dfw │ │ city-code=ord │ +│ Compute CRDs │ │ Compute CRDs │ +│ NSO CRDs │ │ NSO CRDs │ +└────────────────────┘ └────────────────────┘ +``` + +### What lives where + +| Resource | Cluster | +|----------|---------| +| `Workload`, `WorkloadDeployment` (consumer-facing) | Control Plane Cell | +| `WorkloadDeployment` (federation intent), `PropagationPolicy` | Karmada API Server | +| `WorkloadDeployment` (propagated), `Instance`, `NetworkBinding`, `SubnetClaim` | POP cells | +| `Instance` (write-back for 
visibility) | Karmada API Server | + +--- + +## Running the environment + +### Start + +```bash +task e2e:up +``` + +This is fully idempotent — running it twice will not fail. + +What it does, in order: + +1. Downloads `karmadactl v1.16.0` into `./bin/` (once). +2. Adds the `karmada-charts` Helm repository. +3. Creates Kind clusters `compute-control-plane`, `compute-pop-dfw`, + `compute-pop-ord` (skips any that already exist). +4. Exports kubeconfigs to `./tmp/e2e/kubeconfigs/`. +5. Installs Karmada v1.16.0 via the `karmada-charts/karmada` Helm chart into + `compute-control-plane`, with the API server exposed on NodePort 32443. +6. Registers `compute-pop-dfw` and `compute-pop-ord` as member clusters and + labels each with `topology.datum.net/city-code`. +7. Installs compute CRDs to all clusters and the Karmada API server. +8. Installs NSO CRDs to the POP cell clusters. + +### Stop + +```bash +task e2e:down +``` + +Deletes all three Kind clusters and removes `./tmp/e2e/`. + +--- + +## Kubeconfigs + +After `task e2e:up`: + +| File | Cluster | Use for | +|------|---------|---------| +| `tmp/e2e/kubeconfigs/control-plane.yaml` | `compute-control-plane` | kubectl, deploying the compute operator | +| `tmp/e2e/kubeconfigs/karmada.yaml` | Karmada API server | kubectl, karmadactl | +| `tmp/e2e/kubeconfigs/pop-dfw.yaml` | `compute-pop-dfw` | kubectl, inspecting POP cell state | +| `tmp/e2e/kubeconfigs/pop-ord.yaml` | `compute-pop-ord` | kubectl, inspecting POP cell state | + +The `-internal.yaml` variants use the Kind container's Docker bridge IP and are +intended for the Karmada controller running inside Docker — not for direct +developer use. + +### Quick check + +```bash +# Verify cluster list in Karmada +kubectl --kubeconfig tmp/e2e/kubeconfigs/karmada.yaml get clusters + +# Expected output: +# NAME READY AGE +# compute-pop-dfw True ... +# compute-pop-ord True ... 
+ +# Verify city-code labels +kubectl --kubeconfig tmp/e2e/kubeconfigs/karmada.yaml \ + get clusters -L topology.datum.net/city-code +``` + +--- + +## Using the environment from e2e tests + +Import `go.datum.net/compute/test/e2e/env` in your test suite: + +```go +package myfeature_test + +import ( + "testing" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/runtime" + computev1alpha1 "go.datum.net/compute/api/v1alpha1" + + "go.datum.net/compute/test/e2e/env" +) + +var testEnv *env.Environment + +func TestMyFeature(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "MyFeature Suite") +} + +var _ = BeforeSuite(func() { + scheme := runtime.NewScheme() + Expect(corev1.AddToScheme(scheme)).To(Succeed()) + Expect(computev1alpha1.AddToScheme(scheme)).To(Succeed()) + + var err error + testEnv, err = env.New(scheme) + Expect(err).NotTo(HaveOccurred()) +}) + +var _ = It("creates a workload and propagates it", func() { + // Control plane cluster client + cpClient := testEnv.ControlPlane.Client + + // Karmada API server client + karmadaClient := testEnv.Downstream.Client + + // POP DFW cluster client + dfwCell, err := testEnv.POPCell(env.CityCodeDFW) + Expect(err).NotTo(HaveOccurred()) + dfwClient := dfwCell.Client + + _ = cpClient + _ = karmadaClient + _ = dfwClient +}) +``` + +### Environment variable override + +Set `E2E_KUBECONFIG_DIR` to an absolute path to load kubeconfigs from a +different directory (useful in CI): + +```bash +E2E_KUBECONFIG_DIR=/path/to/kubeconfigs go test ./test/e2e/... +``` + +--- + +## Networking notes (macOS) + +On macOS with Docker Desktop, Kind clusters run as Docker containers. 
The +container-to-container networking works as follows: + +| From | To | Address used | +|------|----|--------------| +| macOS host | Any Kind cluster API server | `localhost:<port>` | +| macOS host | Karmada API server | `https://localhost:32443` (NodePort) | +| Karmada controller (in Docker) | POP cell API servers | Docker bridge IP (`172.18.x.x:6443`) | + +The `-internal.yaml` kubeconfig variants use Docker bridge IPs with +`insecure-skip-tls-verify: true` because the node certificates do not include +bridge IPs in their SANs. This is acceptable for a local dev environment. + +--- + +## Troubleshooting + +### Karmada API server not reachable + +```bash +kubectl --kubeconfig tmp/e2e/kubeconfigs/karmada.yaml get ns +``` + +If this times out, check: +1. The Kind cluster is running: `kind get clusters` +2. Port 32443 is mapped: `docker port compute-control-plane-control-plane` +3. The karmada-apiserver pod is running: + ```bash + kubectl --kubeconfig tmp/e2e/kubeconfigs/control-plane.yaml \ + get pods -n karmada-system + ``` + +### POP cluster shows NotReady in Karmada + +The Karmada controller manager uses the Docker bridge IP kubeconfig to reach +POP cells. Check: + +```bash +kubectl --kubeconfig tmp/e2e/kubeconfigs/karmada.yaml \ + describe cluster compute-pop-dfw +``` + +Then verify the cluster secret contains the expected Docker IP: + +```bash +kubectl --kubeconfig tmp/e2e/kubeconfigs/karmada.yaml \ + get secret -n karmada-system | grep pop-dfw +``` + +### Start fresh + +```bash +task e2e:down && task e2e:up +``` diff --git a/test/e2e/env/env.go b/test/e2e/env/env.go new file mode 100644 index 00000000..7d2c59c6 --- /dev/null +++ b/test/e2e/env/env.go @@ -0,0 +1,233 @@ +// Package env provides helpers for connecting to the local Kind e2e environment +// created by "task e2e:up". 
+// +// # Environment layout +// +// The environment consists of three Kind clusters and one downstream API server: +// +// - Control plane cell — hosts the compute operator (WorkloadReconciler, +// WorkloadDeploymentFederator, InstanceProjector). +// - Downstream control plane — the federation API server; WorkloadDeployments +// are written here so they can be propagated to POP cells. +// - POP DFW (compute-pop-dfw) — member cluster labelled city-code=dfw. +// - POP ORD (compute-pop-ord) — member cluster labelled city-code=ord. +// +// # Kubeconfig resolution +// +// Kubeconfigs are read from the directory at [DefaultKubeconfigDir] (relative +// to the repository root), unless overridden via the [EnvKubeconfigDir] +// environment variable. +// +// Expected files inside that directory: +// +// control-plane.yaml — management / control-plane cell +// downstream.yaml — downstream federation API server (https://localhost:32443) +// pop-dfw.yaml — POP DFW cell (standard Kind localhost-based kubeconfig) +// pop-ord.yaml — POP ORD cell (standard Kind localhost-based kubeconfig) +// +// # Typical usage in a Ginkgo suite +// +// var ( +// testEnv *env.Environment +// ) +// +// var _ = BeforeSuite(func() { +// scheme := runtime.NewScheme() +// Expect(computev1alpha1.AddToScheme(scheme)).To(Succeed()) +// Expect(corev1.AddToScheme(scheme)).To(Succeed()) +// +// var err error +// testEnv, err = env.New(scheme) +// Expect(err).NotTo(HaveOccurred()) +// }) +package env + +import ( + "fmt" + "os" + "path/filepath" + "runtime" + + k8sruntime "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/rest" + "k8s.io/client-go/tools/clientcmd" + ctrlclient "sigs.k8s.io/controller-runtime/pkg/client" +) + +// Environment variable name that overrides the kubeconfig directory. +const EnvKubeconfigDir = "E2E_KUBECONFIG_DIR" + +// DefaultKubeconfigDir is the kubeconfig directory used when [EnvKubeconfigDir] +// is not set. 
It is resolved relative to the repository root (three directories +// above this source file). +const DefaultKubeconfigDir = "tmp/e2e/kubeconfigs" + +// City codes for the two POP cells created by "task e2e:up". +const ( + CityCodeDFW = "dfw" + CityCodeORD = "ord" +) + +// Environment holds a [ClusterAccess] for each cluster in the local e2e +// environment. All fields are populated by [New]; none are nil on success. +type Environment struct { + // ControlPlane is the management / control-plane cell cluster. + // The compute operator runs here (WorkloadReconciler, + // WorkloadDeploymentFederator, InstanceProjector). + ControlPlane *ClusterAccess + + // Downstream is the downstream control plane. + // WorkloadDeployments and PropagationPolicies live here. + Downstream *ClusterAccess + + // POPCells maps city-code strings (e.g. "dfw", "ord") to the + // corresponding POP cell cluster. Use [Environment.POPCell] for + // safe, error-returning access. + POPCells map[string]*ClusterAccess +} + +// ClusterAccess bundles a REST config and a controller-runtime Client for a +// single cluster. +type ClusterAccess struct { + // Config is the REST config used to build the client. + Config *rest.Config + + // Client is a controller-runtime client scoped to this cluster. + // The client is built with the scheme supplied to [New]. + Client ctrlclient.Client +} + +// New creates an [Environment] by loading kubeconfigs from the configured +// directory and building a controller-runtime client for each cluster using +// the provided scheme. +// +// The scheme should have all relevant types registered before calling New; +// for example compute types, networking types, and core Kubernetes types. 
+func New(scheme *k8sruntime.Scheme) (*Environment, error) { + dir := kubeconfigDir() + + controlPlane, err := loadCluster(filepath.Join(dir, "control-plane.yaml"), scheme) + if err != nil { + return nil, fmt.Errorf("control-plane cluster: %w", err) + } + + downstream, err := loadCluster(filepath.Join(dir, "downstream.yaml"), scheme) + if err != nil { + return nil, fmt.Errorf("downstream control plane: %w", err) + } + + popDFW, err := loadCluster(filepath.Join(dir, "pop-dfw.yaml"), scheme) + if err != nil { + return nil, fmt.Errorf("POP DFW cluster: %w", err) + } + + popORD, err := loadCluster(filepath.Join(dir, "pop-ord.yaml"), scheme) + if err != nil { + return nil, fmt.Errorf("POP ORD cluster: %w", err) + } + + return &Environment{ + ControlPlane: controlPlane, + Downstream: downstream, + POPCells: map[string]*ClusterAccess{ + CityCodeDFW: popDFW, + CityCodeORD: popORD, + }, + }, nil +} + +// POPCell returns the [ClusterAccess] for the POP cell with the given city +// code. It returns an error if no POP cell is registered for that code. +func (e *Environment) POPCell(cityCode string) (*ClusterAccess, error) { + ca, ok := e.POPCells[cityCode] + if !ok { + known := make([]string, 0, len(e.POPCells)) + for k := range e.POPCells { + known = append(known, k) + } + return nil, fmt.Errorf("no POP cell registered for city code %q (known: %v)", cityCode, known) + } + return ca, nil +} + +// MustPOPCell is like [Environment.POPCell] but panics on error. +// Useful in test setup where a missing POP cell is always a fatal misconfiguration. +func (e *Environment) MustPOPCell(cityCode string) *ClusterAccess { + ca, err := e.POPCell(cityCode) + if err != nil { + panic(err) + } + return ca +} + +// RESTConfigFor is a convenience function that returns a [rest.Config] for the +// named cluster without constructing a client. Useful when the caller needs to +// build a typed clientset directly. 
+func RESTConfigFor(kubeconfigPath string) (*rest.Config, error) { + cfg, err := clientcmd.BuildConfigFromFlags("", kubeconfigPath) + if err != nil { + return nil, fmt.Errorf("building REST config from %s: %w", kubeconfigPath, err) + } + return cfg, nil +} + +// KubeconfigPath returns the absolute path to the kubeconfig file for the +// named cluster. name must be one of "control-plane", "downstream", "pop-dfw", +// or "pop-ord". +func KubeconfigPath(name string) string { + return filepath.Join(kubeconfigDir(), name+".yaml") +} + +// ─── internal helpers ──────────────────────────────────────────────────────── + +func loadCluster(kubeconfigPath string, scheme *k8sruntime.Scheme) (*ClusterAccess, error) { + cfg, err := clientcmd.BuildConfigFromFlags("", kubeconfigPath) + if err != nil { + return nil, fmt.Errorf("building REST config from %s: %w", kubeconfigPath, err) + } + + c, err := ctrlclient.New(cfg, ctrlclient.Options{Scheme: scheme}) + if err != nil { + return nil, fmt.Errorf("building client from %s: %w", kubeconfigPath, err) + } + + return &ClusterAccess{ + Config: cfg, + Client: c, + }, nil +} + +// kubeconfigDir returns the directory containing e2e kubeconfigs. +// It honours the E2E_KUBECONFIG_DIR environment variable, otherwise falls +// back to <repo-root>/tmp/e2e/kubeconfigs. +func kubeconfigDir() string { + if dir := os.Getenv(EnvKubeconfigDir); dir != "" { + return dir + } + return filepath.Join(repoRoot(), DefaultKubeconfigDir) +} + +// repoRoot walks up from this source file to find the repository root +// (identified by the presence of go.mod). +func repoRoot() string { + // Use the file path of this source file as a starting point so the helper + // works regardless of the caller's working directory. + _, thisFile, _, ok := runtime.Caller(0) + if !ok { + // Fallback: assume tests are run from the repo root. + return "." 
+ } + + dir := filepath.Dir(thisFile) + for { + if _, err := os.Stat(filepath.Join(dir, "go.mod")); err == nil { + return dir + } + parent := filepath.Dir(dir) + if parent == dir { + // Reached filesystem root without finding go.mod. + return "." + } + dir = parent + } +} diff --git a/test/e2e/full-federation/chainsaw-test.yaml b/test/e2e/full-federation/chainsaw-test.yaml new file mode 100644 index 00000000..020a2bc9 --- /dev/null +++ b/test/e2e/full-federation/chainsaw-test.yaml @@ -0,0 +1,150 @@ +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Test +metadata: + name: full-federation +spec: + description: | + End-to-end federation chain test. + + Exercises the complete path from WorkloadDeployment creation through to + Instance projection on the control-plane cluster: + + 1. Create WorkloadDeployment on control-plane. + 2. WorkloadDeploymentFederator replicates it to Karmada (ns-<uid> namespace). + 3. Karmada PropagationPolicy routes the WD to pop-dfw. + 4. WorkloadDeploymentReconciler on pop-dfw creates Instance test-full-fed-wd-0. + 5. InstanceReconciler on pop-dfw writes Instance back to Karmada with + label meta.datumapis.com/upstream-cluster-name: cluster-single. + 6. InstanceProjector on control-plane creates a projection of the Instance + in the project namespace. + + Prerequisites: both operator instances must be running (task e2e:operator:start). + + template: true + + steps: + - name: create-workload-deployment + description: Create the WorkloadDeployment on the control-plane cluster. + try: + - apply: + file: workload-deployment.yaml + + - name: assert-wd-in-downstream + description: Assert WorkloadDeploymentFederator replicated the WD to Karmada and status is synced back. 
+ cluster: downstream + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: downstreamNS + value: ($stdout) + - assert: + timeout: 30s + resource: + apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + metadata: + namespace: ($downstreamNS) + name: test-full-fed-wd + - assert: + # Wait for the cell operator to write status back to the Karmada WD. + timeout: 60s + resource: + apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + metadata: + namespace: ($downstreamNS) + name: test-full-fed-wd + status: + replicas: 1 + desiredReplicas: 1 + + - name: assert-wd-on-pop-dfw + description: Assert Karmada propagated the WD to pop-dfw and the cell reconciler set status. + cluster: pop-dfw + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: downstreamNS + value: ($stdout) + - assert: + # Karmada propagation can take longer than a local apply. + timeout: 60s + resource: + apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + metadata: + namespace: ($downstreamNS) + name: test-full-fed-wd + status: + replicas: 1 + desiredReplicas: 1 + + - name: assert-instance-on-pop-dfw + description: Assert WorkloadDeploymentReconciler created an Instance on pop-dfw with a Ready condition. 
+ cluster: pop-dfw + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: downstreamNS + value: ($stdout) + - assert: + timeout: 30s + resource: + apiVersion: compute.datumapis.com/v1alpha + kind: Instance + metadata: + namespace: ($downstreamNS) + name: test-full-fed-wd-0 + (status.conditions[?type == 'Ready'] | [0]): + status: "Unknown" + + - name: assert-instance-writeback-in-downstream + description: Assert InstanceReconciler wrote the Instance back to Karmada. + cluster: downstream + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: downstreamNS + value: ($stdout) + - assert: + timeout: 30s + resource: + apiVersion: compute.datumapis.com/v1alpha + kind: Instance + metadata: + namespace: ($downstreamNS) + name: test-full-fed-wd-0 + labels: + meta.datumapis.com/upstream-cluster-name: cluster-single + + - name: assert-instance-projected-to-control-plane + description: Assert InstanceProjector created a projection with status on the control-plane. 
+ try: + - assert: + timeout: 30s + resource: + apiVersion: compute.datumapis.com/v1alpha + kind: Instance + metadata: + namespace: ($namespace) + name: test-full-fed-wd-0 + labels: + meta.datumapis.com/upstream-cluster-name: cluster-single + (status.conditions[?type == 'Ready'] | [0]): + status: "Unknown" diff --git a/test/e2e/full-federation/workload-deployment.yaml b/test/e2e/full-federation/workload-deployment.yaml new file mode 100644 index 00000000..70b4cb94 --- /dev/null +++ b/test/e2e/full-federation/workload-deployment.yaml @@ -0,0 +1,21 @@ +apiVersion: compute.datumapis.com/v1alpha +kind: WorkloadDeployment +metadata: + name: test-full-fed-wd + # namespace is injected by Chainsaw from ($namespace) +spec: + cityCode: dfw + placementName: default + workloadRef: + name: test-workload + uid: "00000000-0000-0000-0000-000000000001" + template: + spec: + runtime: + resources: + instanceType: datumcloud/d1-standard-2 + networkInterfaces: + - network: + name: test-network + scaleSettings: + minReplicas: 1 diff --git a/test/e2e/instance-projection/assert-downstream-wd.yaml b/test/e2e/instance-projection/assert-downstream-wd.yaml new file mode 100644 index 00000000..705d0893 --- /dev/null +++ b/test/e2e/instance-projection/assert-downstream-wd.yaml @@ -0,0 +1,6 @@ +# Assert the WorkloadDeployment is federated to Karmada (and the Karmada namespace created). +apiVersion: compute.datumapis.com/v1alpha +kind: WorkloadDeployment +metadata: + namespace: ($downstreamNS) + name: test-projector-wd diff --git a/test/e2e/instance-projection/assert-projected-instance.yaml b/test/e2e/instance-projection/assert-projected-instance.yaml new file mode 100644 index 00000000..0542194d --- /dev/null +++ b/test/e2e/instance-projection/assert-projected-instance.yaml @@ -0,0 +1,19 @@ +# Assert the InstanceProjector created a projection in the project namespace. 
+# +# The InstanceProjector (internal/controller/instance_projector.go): +# - Watches Instances in Karmada that carry upstreamClusterNameLabel +# - Strips "cluster-" prefix to get the cluster name ("single" in single-provider mode) +# - Finds the project namespace by matching ns-<uid> to namespace UIDs +# - Creates/updates the Instance projection in the project namespace +# - Sets an owner reference to the WorkloadDeployment for cascading deletion +apiVersion: compute.datumapis.com/v1alpha +kind: Instance +metadata: + # namespace is the Chainsaw test namespace (the project namespace on control-plane) + name: test-projected-instance + labels: + meta.datumapis.com/upstream-cluster-name: cluster-single + ownerReferences: + - apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + name: test-projector-wd diff --git a/test/e2e/instance-projection/chainsaw-test.yaml b/test/e2e/instance-projection/chainsaw-test.yaml new file mode 100644 index 00000000..16fa9f96 --- /dev/null +++ b/test/e2e/instance-projection/chainsaw-test.yaml @@ -0,0 +1,123 @@ +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Test +metadata: + name: instance-projection +spec: + description: | + Verifies that the InstanceProjector watches Instances written back to the + Karmada API server and creates corresponding read-only projections in the + project namespace on the control-plane cluster. + + Flow: + 1. Create a WorkloadDeployment → triggers federator → Karmada namespace created. + 2. Write an Instance to Karmada (simulating a POP-cell InstanceReconciler write-back). + 3. InstanceProjector detects the Karmada Instance and creates a projection in the + project namespace (the Chainsaw test namespace on the control-plane cluster). + 4. Assert the projection exists with the upstream tracking label and an owner + reference to the WorkloadDeployment (for cascading deletion). 
+ + Cluster name label: "cluster-single" + The compute operator runs in single-provider mode for this e2e environment, + registering the control-plane cluster with the multicluster-runtime manager + under the name "single" (see cmd/main.go, wrappedSingleClusterProvider). + + template: true + + steps: + - name: create-wd + description: Create the WorkloadDeployment to trigger federation and namespace creation. + try: + - apply: + file: workload-deployment.yaml + + - name: wait-for-downstream-namespace + description: Wait for the federated WorkloadDeployment to appear in Karmada. + cluster: downstream + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: downstreamNS + value: ($stdout) + - assert: + timeout: 30s + resource: + apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + metadata: + namespace: ($downstreamNS) + name: test-projector-wd + + - name: write-instance-to-downstream + description: | + Write an Instance to Karmada simulating InstanceReconciler write-back. + Uses explicit control-plane kubeconfig to derive downstreamNS and WD UID. 
+ cluster: downstream + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: downstreamNS + value: ($stdout) + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get workloaddeployment test-projector-wd \ + --namespace "$NAMESPACE" \ + -o jsonpath='{.metadata.uid}' + outputs: + - name: wdUID + value: ($stdout) + - script: + env: + - name: KARMADA_NS + value: ($downstreamNS) + - name: WD_UID + value: ($wdUID) + content: | + kubectl apply -f - < is the multicluster-runtime cluster name registered by +# wrappedSingleClusterProvider (always "single" in single-cluster mode) +# - Label meta.datumapis.com/upstream-namespace = the POP-cell namespace +apiVersion: compute.datumapis.com/v1alpha +kind: Instance +metadata: + namespace: ($instanceNS) + name: test-writeback-instance + labels: + meta.datumapis.com/upstream-cluster-name: cluster-single + meta.datumapis.com/upstream-namespace: ($instanceNS) diff --git a/test/e2e/instance-writeback/chainsaw-test.yaml b/test/e2e/instance-writeback/chainsaw-test.yaml new file mode 100644 index 00000000..32dbbc5d --- /dev/null +++ b/test/e2e/instance-writeback/chainsaw-test.yaml @@ -0,0 +1,112 @@ +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Test +metadata: + name: instance-writeback +spec: + description: | + Verifies that the InstanceReconciler running in a POP-cell cluster writes + Instance objects back to the Karmada API server after reconciling the Ready + condition for the first time. + + Write-back convention (internal/controller/instance_controller.go): + - The Instance is written to Karmada at the same namespace/name as the POP-cell Instance. + - Label meta.datumapis.com/upstream-cluster-name is set to + "cluster-" (e.g. "cluster-compute-pop-dfw"). + - Label meta.datumapis.com/upstream-namespace records the originating namespace. 
+ + Note: this test requires the compute operator (InstanceReconciler) to be running + in the DFW POP cell cluster. + + template: true + + steps: + - name: setup-namespaces + description: Create the Instance namespace in the DFW POP cell and Karmada. + try: + - script: + content: | + kubectl get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: instanceNS + value: ($stdout) + - script: + env: + - name: INSTANCE_NS + value: ($instanceNS) + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/pop-dfw.yaml \ + create namespace "$INSTANCE_NS" \ + --dry-run=client -o yaml | \ + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/pop-dfw.yaml apply -f - + - script: + env: + - name: INSTANCE_NS + value: ($instanceNS) + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/downstream.yaml \ + create namespace "$INSTANCE_NS" \ + --dry-run=client -o yaml | \ + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/downstream.yaml apply -f - + cleanup: + - script: + env: + - name: INSTANCE_NS + value: ($instanceNS) + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/pop-dfw.yaml \ + delete namespace "$INSTANCE_NS" --ignore-not-found + - script: + env: + - name: INSTANCE_NS + value: ($instanceNS) + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/downstream.yaml \ + delete namespace "$INSTANCE_NS" --ignore-not-found + + - name: create-instance-on-pop-dfw + description: Create the Instance on the DFW POP cell cluster. 
+ cluster: pop-dfw + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: instanceNS + value: ($stdout) + - apply: + file: instance-pop-dfw.yaml + cleanup: + - script: + content: | + INSTANCE_NS=$(kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}') + kubectl delete instance test-writeback-instance \ + --namespace "$INSTANCE_NS" --ignore-not-found + + - name: assert-instance-in-downstream + description: Wait for the InstanceReconciler to write back the Instance to Karmada. + cluster: downstream + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: instanceNS + value: ($stdout) + - assert: + timeout: 30s + resource: + apiVersion: compute.datumapis.com/v1alpha + kind: Instance + metadata: + namespace: ($instanceNS) + name: test-writeback-instance + labels: + meta.datumapis.com/upstream-cluster-name: cluster-single + meta.datumapis.com/upstream-namespace: ($instanceNS) diff --git a/test/e2e/instance-writeback/instance-pop-dfw.yaml b/test/e2e/instance-writeback/instance-pop-dfw.yaml new file mode 100644 index 00000000..250eb7d7 --- /dev/null +++ b/test/e2e/instance-writeback/instance-pop-dfw.yaml @@ -0,0 +1,15 @@ +# Instance created in the DFW POP cell. +# ($instanceNS) is the namespace derived from the Chainsaw test namespace UID, +# matching the ns-<uid> convention so the InstanceProjector can resolve it later. 
+apiVersion: compute.datumapis.com/v1alpha
+kind: Instance
+metadata:
+  name: test-writeback-instance
+  namespace: ($instanceNS)  # bound by the script in the create-instance-on-pop-dfw step
+spec:
+  runtime:
+    resources:
+      instanceType: datumcloud/d1-standard-2
+  networkInterfaces:
+    - network:
+        name: test-network
diff --git a/test/e2e/propagation-policy-lifecycle/assert-pp-exists.yaml b/test/e2e/propagation-policy-lifecycle/assert-pp-exists.yaml
new file mode 100644
index 00000000..77a817a5
--- /dev/null
+++ b/test/e2e/propagation-policy-lifecycle/assert-pp-exists.yaml
@@ -0,0 +1,6 @@
+# Asserts that the PropagationPolicy for city dfw exists in the Karmada namespace.
+apiVersion: policy.karmada.io/v1alpha1
+kind: PropagationPolicy
+metadata:
+  namespace: ($downstreamNS)
+  name: workload-deployments-dfw  # NOTE(review): chainsaw-test.yaml in this directory asserts "city-dfw" inline and does not reference this file - confirm the real name
diff --git a/test/e2e/propagation-policy-lifecycle/chainsaw-test.yaml b/test/e2e/propagation-policy-lifecycle/chainsaw-test.yaml
new file mode 100644
index 00000000..5678c398
--- /dev/null
+++ b/test/e2e/propagation-policy-lifecycle/chainsaw-test.yaml
@@ -0,0 +1,133 @@
+apiVersion: chainsaw.kyverno.io/v1alpha1
+kind: Test
+metadata:
+  name: propagation-policy-lifecycle
+spec:
+  description: |
+    Verifies the PropagationPolicy lifecycle managed by the WorkloadDeploymentFederator:
+
+    - A PropagationPolicy (city-dfw) is lazily created when the first WorkloadDeployment
+      for city code "dfw" is federated to Karmada.
+    - The PropagationPolicy is RETAINED while at least one WorkloadDeployment for
+      that city code remains in the Karmada namespace.
+    - The PropagationPolicy is DELETED when the last deployment for the city is removed.
+
+    The test creates two WDs (wd-alpha, wd-beta) both targeting cityCode=dfw, verifies
+    the PP appears, deletes wd-alpha and asserts the PP is still present, then deletes
+    wd-beta and waits for the PP to disappear.
+
+  template: true
+
+  steps:
+    - name: create-deployments
+      description: Create two WorkloadDeployments targeting dfw on the control-plane.
+      try:
+        - apply:
+            file: workload-deployment-alpha.yaml
+        - apply:
+            file: workload-deployment-beta.yaml
+
+    - name: assert-policy-created
+      description: |
+        Assert both WDs are federated to Karmada and the PropagationPolicy exists.
+        Both WDs must be present in Karmada before proceeding to the deletion steps;
+        otherwise wd-alpha's finalizer could see an empty Karmada list and prematurely
+        delete the PP before wd-beta has been federated.
+      cluster: downstream
+      try:
+        - script:  # derive "ns-<uid>" from the test namespace, read via the control-plane kubeconfig
+            content: |
+              kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \
+                get namespace "$NAMESPACE" \
+                -o template='{{printf "ns-%s" .metadata.uid}}'
+            outputs:
+              - name: downstreamNS
+                value: ($stdout)  # the script's stdout becomes the downstreamNS binding
+        - assert:  # wd-alpha has been federated
+            timeout: 30s
+            resource:
+              apiVersion: compute.datumapis.com/v1alpha
+              kind: WorkloadDeployment
+              metadata:
+                namespace: ($downstreamNS)
+                name: wd-alpha
+        - assert:  # wd-beta has been federated
+            timeout: 30s
+            resource:
+              apiVersion: compute.datumapis.com/v1alpha
+              kind: WorkloadDeployment
+              metadata:
+                namespace: ($downstreamNS)
+                name: wd-beta
+        - assert:  # the per-city PropagationPolicy exists
+            timeout: 30s
+            resource:
+              apiVersion: policy.karmada.io/v1alpha1
+              kind: PropagationPolicy
+              metadata:
+                namespace: ($downstreamNS)
+                name: city-dfw
+
+    - name: delete-alpha
+      description: Delete wd-alpha; wd-beta still targets dfw so the PP must be retained.
+      try:
+        - delete:
+            ref:
+              apiVersion: compute.datumapis.com/v1alpha
+              kind: WorkloadDeployment
+              namespace: ($namespace)  # no cluster is set on this step, unlike the assert steps
+              name: wd-alpha
+
+    - name: assert-policy-retained
+      description: Assert the PropagationPolicy is still present after wd-alpha is deleted.
+      cluster: downstream
+      try:
+        - script:  # derive "ns-<uid>" from the test namespace, read via the control-plane kubeconfig
+            content: |
+              kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \
+                get namespace "$NAMESPACE" \
+                -o template='{{printf "ns-%s" .metadata.uid}}'
+            outputs:
+              - name: downstreamNS
+                value: ($stdout)
+        - sleep:
+            duration: 8s  # NOTE(review): presumably gives the federator time to react to the wd-alpha deletion - confirm the value
+        - assert:  # after the pause the policy must still be present
+            timeout: 5s
+            resource:
+              apiVersion: policy.karmada.io/v1alpha1
+              kind: PropagationPolicy
+              metadata:
+                namespace: ($downstreamNS)
+                name: city-dfw
+
+    - name: delete-beta
+      description: Delete wd-beta (the last WD for city dfw).
+      try:
+        - delete:
+            ref:
+              apiVersion: compute.datumapis.com/v1alpha
+              kind: WorkloadDeployment
+              namespace: ($namespace)
+              name: wd-beta
+
+    - name: assert-policy-deleted
+      description: Wait for the PropagationPolicy to be removed once no WDs remain.
+      cluster: downstream
+      try:
+        - script:  # derive "ns-<uid>" from the test namespace, read via the control-plane kubeconfig
+            content: |
+              kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \
+                get namespace "$NAMESPACE" \
+                -o template='{{printf "ns-%s" .metadata.uid}}'
+            outputs:
+              - name: downstreamNS
+                value: ($stdout)
+        - wait:
+            apiVersion: policy.karmada.io/v1alpha1
+            kind: PropagationPolicy
+            namespace: ($downstreamNS)
+            name: city-dfw
+            timeout: 30s
+            for:
+              deletion: {}  # wait for the policy to be deleted, up to the 30s timeout
diff --git a/test/e2e/propagation-policy-lifecycle/workload-deployment-alpha.yaml b/test/e2e/propagation-policy-lifecycle/workload-deployment-alpha.yaml
new file mode 100644
index 00000000..f9eb27fd
--- /dev/null
+++ b/test/e2e/propagation-policy-lifecycle/workload-deployment-alpha.yaml
@@ -0,0 +1,21 @@
+apiVersion: compute.datumapis.com/v1alpha
+kind: WorkloadDeployment
+metadata:
+  name: wd-alpha
+spec:
+  cityCode: dfw  # same city code as wd-beta
+  placementName: default
+  workloadRef:
+    name: test-workload
+    uid: "00000000-0000-0000-0000-000000000001"  # same workloadRef as wd-beta
+  template:
+    spec:
+      runtime:
+        resources:
+          instanceType: datumcloud/d1-standard-2
+      networkInterfaces:
+        - network:
+            name: test-network
+
+  scaleSettings:
+    minReplicas: 1
diff --git a/test/e2e/propagation-policy-lifecycle/workload-deployment-beta.yaml
b/test/e2e/propagation-policy-lifecycle/workload-deployment-beta.yaml
new file mode 100644
index 00000000..fd1d65c1
--- /dev/null
+++ b/test/e2e/propagation-policy-lifecycle/workload-deployment-beta.yaml
@@ -0,0 +1,21 @@
+apiVersion: compute.datumapis.com/v1alpha
+kind: WorkloadDeployment
+metadata:
+  name: wd-beta
+spec:
+  cityCode: dfw  # same city code as wd-alpha
+  placementName: default
+  workloadRef:
+    name: test-workload
+    uid: "00000000-0000-0000-0000-000000000001"  # same workloadRef as wd-alpha
+  template:
+    spec:
+      runtime:
+        resources:
+          instanceType: datumcloud/d1-standard-2
+      networkInterfaces:
+        - network:
+            name: test-network
+
+  scaleSettings:
+    minReplicas: 1
diff --git a/test/e2e/workload-deployment-federation/assert-downstream-pp.yaml b/test/e2e/workload-deployment-federation/assert-downstream-pp.yaml
new file mode 100644
index 00000000..98f8d0f1
--- /dev/null
+++ b/test/e2e/workload-deployment-federation/assert-downstream-pp.yaml
@@ -0,0 +1,20 @@
+# Assert the PropagationPolicy was created in the Karmada namespace.
+# The name follows propagationPolicyNameFor("dfw") = "workload-deployments-dfw".
+# ($downstreamNS) is substituted by Chainsaw's template engine.
+apiVersion: policy.karmada.io/v1alpha1
+kind: PropagationPolicy
+metadata:
+  namespace: ($downstreamNS)
+  name: workload-deployments-dfw  # NOTE(review): chainsaw-test.yaml in this directory asserts "city-dfw" inline - confirm which name the federator generates
+spec:
+  resourceSelectors:
+    - apiVersion: compute.datumapis.com/v1alpha
+      kind: WorkloadDeployment
+      labelSelector:
+        matchLabels:
+          topology.datum.net/city-code: dfw
+  placement:
+    clusterAffinity:
+      labelSelector:
+        matchLabels:
+          topology.datum.net/city-code: dfw
diff --git a/test/e2e/workload-deployment-federation/assert-downstream-wd.yaml b/test/e2e/workload-deployment-federation/assert-downstream-wd.yaml
new file mode 100644
index 00000000..23c308ff
--- /dev/null
+++ b/test/e2e/workload-deployment-federation/assert-downstream-wd.yaml
@@ -0,0 +1,9 @@
+# Assert the WorkloadDeployment exists in Karmada with the city-code label.
+# ($downstreamNS) is substituted by Chainsaw's template engine from the script binding.
+apiVersion: compute.datumapis.com/v1alpha
+kind: WorkloadDeployment
+metadata:
+  namespace: ($downstreamNS)
+  name: test-federation-wd
+  labels:
+    topology.datum.net/city-code: dfw
diff --git a/test/e2e/workload-deployment-federation/chainsaw-test.yaml b/test/e2e/workload-deployment-federation/chainsaw-test.yaml
new file mode 100644
index 00000000..302d89c4
--- /dev/null
+++ b/test/e2e/workload-deployment-federation/chainsaw-test.yaml
@@ -0,0 +1,84 @@
+apiVersion: chainsaw.kyverno.io/v1alpha1
+kind: Test
+metadata:
+  name: workload-deployment-federation
+spec:
+  description: |
+    Verifies that the WorkloadDeploymentFederator replicates a WorkloadDeployment
+    from the project namespace (control-plane cluster) to the Karmada API server
+    with the correct city-code label and PropagationPolicy.
+
+    The federator follows the ns-<namespace-uid> convention for Karmada namespaces,
+    matching the MappedNamespaceResourceStrategy used by NSO. The test derives
+    the expected Karmada namespace dynamically from the Chainsaw test namespace UID.
+
+    Verified:
+    - WorkloadDeployment exists in Karmada at ns-<namespace-uid>
+    - Karmada copy carries label topology.datum.net/city-code: dfw
+    - PropagationPolicy city-dfw exists in the Karmada namespace,
+      selecting WDs by city-code and routing them to matching POP-cell clusters.
+
+  template: true
+
+  steps:
+    - name: derive-ns-and-create-wd  # this step only applies the manifest
+      description: Create the WorkloadDeployment; the Karmada namespace is derived in the assert steps.
+      try:
+        - apply:
+            file: workload-deployment.yaml  # carries no namespace of its own
+
+    - name: assert-wd-in-downstream
+      description: Assert WorkloadDeployment federated to Karmada with city-code label.
+      cluster: downstream
+      try:
+        - script:  # derive "ns-<uid>" from the test namespace, read via the control-plane kubeconfig
+            content: |
+              kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \
+                get namespace "$NAMESPACE" \
+                -o template='{{printf "ns-%s" .metadata.uid}}'
+            outputs:
+              - name: downstreamNS
+                value: ($stdout)  # the script's stdout becomes the downstreamNS binding
+        - assert:
+            timeout: 30s
+            resource:
+              apiVersion: compute.datumapis.com/v1alpha
+              kind: WorkloadDeployment
+              metadata:
+                namespace: ($downstreamNS)
+                name: test-federation-wd
+                labels:
+                  topology.datum.net/city-code: dfw
+
+    - name: assert-propagation-policy-in-downstream
+      description: Assert PropagationPolicy created for city-dfw.
+      cluster: downstream
+      try:
+        - script:  # same derivation as the previous step; each step computes its own binding
+            content: |
+              kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \
+                get namespace "$NAMESPACE" \
+                -o template='{{printf "ns-%s" .metadata.uid}}'
+            outputs:
+              - name: downstreamNS
+                value: ($stdout)
+        - assert:
+            timeout: 30s
+            resource:
+              apiVersion: policy.karmada.io/v1alpha1
+              kind: PropagationPolicy
+              metadata:
+                namespace: ($downstreamNS)
+                name: city-dfw  # NOTE(review): assert-downstream-pp.yaml in this directory expects "workload-deployments-dfw" - confirm which is current
+              spec:
+                resourceSelectors:  # selects WorkloadDeployments labelled with the dfw city code
+                  - apiVersion: compute.datumapis.com/v1alpha
+                    kind: WorkloadDeployment
+                    labelSelector:
+                      matchLabels:
+                        topology.datum.net/city-code: dfw
+                placement:
+                  clusterAffinity:  # targets clusters carrying the same city-code label
+                    labelSelector:
+                      matchLabels:
+                        topology.datum.net/city-code: dfw
diff --git a/test/e2e/workload-deployment-federation/workload-deployment.yaml b/test/e2e/workload-deployment-federation/workload-deployment.yaml
new file mode 100644
index 00000000..0cd2347a
--- /dev/null
+++ b/test/e2e/workload-deployment-federation/workload-deployment.yaml
@@ -0,0 +1,22 @@
+apiVersion: compute.datumapis.com/v1alpha
+kind: WorkloadDeployment
+metadata:
+  name: test-federation-wd
+  # namespace is injected by Chainsaw from ($namespace)
+spec:
+  cityCode: dfw  # the assert steps expect label topology.datum.net/city-code: dfw on the Karmada copy
+  placementName: default
+  workloadRef:
+    name: test-workload
+    uid: "00000000-0000-0000-0000-000000000001"
+  template:
+    spec:
+      runtime:
+        resources:
+          instanceType: datumcloud/d1-standard-2
+      networkInterfaces:
+        - network:
+            name: test-network
+
+  scaleSettings:
+    minReplicas: 1