diff --git a/client/templates/egress-reachability-check.yaml b/client/templates/egress-reachability-check.yaml
new file mode 100644
index 0000000..9fbf82e
--- /dev/null
+++ b/client/templates/egress-reachability-check.yaml
@@ -0,0 +1,106 @@
+{{- if dig "enabled" true (default dict .Values.egressReachabilityCheck) }}
+{{- /*
+  In-cluster backend-reachability check (client-runtime#116 WS3 / cli#90).
+  A `helm test` hook: run `helm test RELEASE_NAME` to verify a normal (non-training)
+  pod in this namespace can reach the tracebloc backend API. This is the egress
+  dependency that gates EVERYTHING — the cluster authenticates to the backend to
+  obtain its Service Bus credentials, so if the backend is unreachable from inside
+  the cluster, experiments never start (they sit in Pending). It is the required-
+  egress complement to egress-enforcement-check, which verifies the opposite:
+  that *training* pods are locked OUT.
+
+  Service Bus itself is intentionally NOT probed here: its host is fetched
+  post-auth from the backend (static nowhere in the chart) and its egress is
+  brokered by the requests-proxy, whose readiness `tracebloc cluster doctor`
+  already checks. Reaching the backend is the prerequisite for both.
+
+  The probe pod is deliberately NOT labelled `tracebloc.io/workload: training`, so
+  the training-egress lockdown never selects it — it shares the egress class of
+  the jobs-manager / requests-proxy (the pods that actually reach the backend),
+  and honours the corporate proxy via tracebloc.proxyEnv, so it tests the real
+  path. As a test hook it never runs during install/upgrade, so it can never
+  block them or the hourly auto-upgrade. Set egressReachabilityCheck.enabled=false
+  to disable (e.g. a truly air-gapped cluster with no route to the backend).
+*/ -}}
+{{- $env := .Values.env.CLIENT_ENV | default "prod" -}}
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: {{ .Release.Name }}-egress-reachability-check
+  namespace: {{ .Release.Namespace }}
+  labels:
+    {{- include "tracebloc.labels" . | nindent 4 }}
+  annotations:
+    "helm.sh/hook": test
+    "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
+spec:
+  backoffLimit: 0
+  ttlSecondsAfterFinished: 120
+  template:
+    metadata:
+      labels:
+        {{- include "tracebloc.selectorLabels" . | nindent 8 }}
+    spec:
+      restartPolicy: Never
+      automountServiceAccountToken: false
+      securityContext:
+        runAsNonRoot: true
+        # curlimages/curl's default user is non-numeric (curl_user); runAsNonRoot
+        # can't verify that, so pin the image's uid explicitly.
+        runAsUser: 100
+        seccompProfile:
+          type: RuntimeDefault
+      containers:
+        - name: probe
+          image: {{ include "tracebloc.image" (dict "repository" "curlimages/curl" "tag" "8.20.0" "digest" "sha256:b3f1fb2a51d923260350d21b8654bbc607164a987e2f7c84a0ac199a67df812a" "registry" "docker.io") | quote }}
+          imagePullPolicy: IfNotPresent
+          securityContext:
+            allowPrivilegeEscalation: false
+            capabilities:
+              drop: ["ALL"]
+            readOnlyRootFilesystem: true
+          env:
+            {{- include "tracebloc.proxyEnv" . | nindent 12 }}
+            - name: CLIENT_ENV
+              value: {{ $env | quote }}
+          command:
+            - sh
+            - -c
+            - |
+              case "$CLIENT_ENV" in
+                dev) HOST="dev-api.tracebloc.io" ;;
+                stg) HOST="stg-api.tracebloc.io" ;;
+                *) HOST="api.tracebloc.io" ;;
+              esac
+              echo "[egress-reachability-check] CLIENT_ENV=$CLIENT_ENV -> probing backend reachability to https://${HOST}/ (through the proxy if one is configured)..."
+              # Key the verdict on curl's EXIT CODE, not the HTTP status: we only
+              # care whether the TCP+TLS connection to the backend can be made.
+              # No-egress yields connect-refused (7) or timeout (28); a DNS/proxy
+              # resolution failure (6/5) means the host can't be found. A TLS
+              # handshake that never completes (35) or a proxy that rejects the
+              # CONNECT tunnel / a connection cut before any response (56/97)
+              # also means there is no usable path. Any other outcome (0, or an
+              # HTTP-layer code) means the connection succeeded => reachable.
+              curl -sS -m 10 -o /dev/null "https://${HOST}/"; rc=$?
+              case "$rc" in
+                5) echo "FAIL could not resolve the configured proxy (curl exit 5) — check HTTP_PROXY settings."; exit 1 ;;
+                6) echo "FAIL could not resolve ${HOST} (curl exit 6) — in-cluster DNS or egress is broken."; exit 1 ;;
+                7) echo "FAIL connection to ${HOST}:443 refused (curl exit 7) — no egress route to the tracebloc backend."; exit 1 ;;
+                28) echo "FAIL connection to ${HOST}:443 timed out (curl exit 28) — egress blocked by a firewall/proxy/NetworkPolicy. Experiments can't start without backend egress."; exit 1 ;;
+                35) echo "FAIL TLS handshake with ${HOST}:443 did not complete (curl exit 35) — the connection was reset or intercepted before TLS was established."; exit 1 ;;
+                56|97) echo "FAIL no response from ${HOST}:443 (curl exit $rc) — the proxy refused the CONNECT tunnel or the connection was cut. Check the proxy allowlist for ${HOST}."; exit 1 ;;
+              esac
+              echo "OK backend reachable: connected to ${HOST}:443 (curl exit $rc) — the cluster can reach the tracebloc backend."
+              exit 0
+          resources:
+            requests:
+              cpu: "10m"
+              memory: "32Mi"
+            limits:
+              cpu: "100m"
+              memory: "64Mi"
+      {{- if include "tracebloc.useImagePullSecrets" . }}
+      imagePullSecrets:
+        - name: {{ include "tracebloc.registrySecretName" . }}
+      {{- end }}
+{{- end }}
diff --git a/client/tests/egress_reachability_check_test.yaml b/client/tests/egress_reachability_check_test.yaml
new file mode 100644
index 0000000..268ead8
--- /dev/null
+++ b/client/tests/egress_reachability_check_test.yaml
@@ -0,0 +1,64 @@
+suite: In-cluster backend-reachability check
+# client-runtime#116 WS3 / cli#90. A `helm test` Job that verifies a normal
+# (non-training) pod can reach the tracebloc backend — the egress dependency
+# that gates everything. These guards pin: the on/off flag, the test-hook
+# annotation (so it never runs during install/upgrade), that the probe is NOT
+# labelled as a training workload (so the lockdown netpol never selects it and
+# it keeps the jobs-manager/requests-proxy egress class), and that the probed
+# backend tracks CLIENT_ENV.
+templates:
+  - templates/egress-reachability-check.yaml
+set:
+  clientId: "test-id"
+  clientPassword: "test"
+tests:
+  - it: renders a single helm-test Job by default
+    asserts:
+      - hasDocuments:
+          count: 1
+      - isKind:
+          of: Job
+      - equal:
+          path: metadata.annotations["helm.sh/hook"]
+          value: test
+
+  - it: does not render when disabled
+    set:
+      egressReachabilityCheck.enabled: false
+    asserts:
+      - hasDocuments:
+          count: 0
+
+  - it: is NOT labelled as a training workload (keeps its own egress)
+    asserts:
+      - notExists:
+          path: spec.template.metadata.labels["tracebloc.io/workload"]
+
+  - it: defaults the probe to the prod backend
+    asserts:
+      - contains:
+          path: spec.template.spec.containers[0].env
+          content:
+            name: CLIENT_ENV
+            value: prod
+
+  - it: targets the dev backend when CLIENT_ENV=dev
+    set:
+      env.CLIENT_ENV: dev
+    asserts:
+      - contains:
+          path: spec.template.spec.containers[0].env
+          content:
+            name: CLIENT_ENV
+            value: dev
+
+  - it: inherits the corporate proxy env so it tests the real egress path
+    set:
+      env.HTTP_PROXY_HOST: proxy.corp
+      env.HTTP_PROXY_PORT: "3128"
+    asserts:
+      - contains:
+          path: spec.template.spec.containers[0].env
+          content:
+            name: HTTPS_PROXY
+            value: http://proxy.corp:3128
diff --git a/client/values.yaml b/client/values.yaml
index 091552e..706cec7 100644
--- a/client/values.yaml
+++ b/client/values.yaml
@@ -205,6 +205,18 @@ networkPolicy:
       - "172.16.0.0/12"
       - "192.168.0.0/16"
 
+# -- In-cluster backend-reachability check (client-runtime#116 WS3 / cli#90).
+# A `helm test` Job that verifies a normal (non-training) pod in this namespace
+# can reach the tracebloc backend API — the egress dependency that gates
+# everything (the cluster authenticates to the backend to obtain its Service Bus
+# credentials, so no backend egress => experiments sit in Pending). It is the
+# required-egress complement to networkPolicy.training.enforcementProbeHost
+# (which verifies the opposite: that training pods are locked out). As a test
+# hook it never runs during install/upgrade. Set enabled=false on a truly
+# air-gapped cluster with no route to the backend.
+egressReachabilityCheck:
+  enabled: true
+
 # -- Egress gateway (squid) — SECURITY §8.2 / client-runtime#102.
 # In-cluster forward proxy that lets a locked-down training pod reach an FQDN
 # allowlist (backend + App Insights) and nothing else. Labelled app=egress-proxy