tracebloc · saadqbal · Jun 22, 2026 · Jun 22, 2026
@@ -0,0 +1,102 @@
+{{- if dig "enabled" true (default dict .Values.egressReachabilityCheck) }}
+{{- /*
+  In-cluster backend-reachability check (client-runtime#116 WS3 / cli#90).
+  A `helm test` hook: run `helm test <release>` to verify a normal (non-training)
+  pod in this namespace can reach the tracebloc backend API. This is the egress
+  dependency that gates EVERYTHING — the cluster authenticates to the backend to
+  obtain its Service Bus credentials, so if the backend is unreachable from inside
+  the cluster, experiments never start (they sit in Pending). It is the required-
+  egress complement to egress-enforcement-check, which verifies the opposite:
+  that *training* pods are locked OUT.
+
+  Service Bus itself is intentionally NOT probed here: its host is fetched
+  post-auth from the backend (static nowhere in the chart) and its egress is
+  brokered by the requests-proxy, whose readiness `tracebloc cluster doctor`
+  already checks. Reaching the backend is the prerequisite for both.
+
+  The probe pod is deliberately NOT labelled `tracebloc.io/workload: training`, so
+  the training-egress lockdown never selects it — it shares the egress class of
+  the jobs-manager / requests-proxy (the pods that actually reach the backend),
+  and honours the corporate proxy via tracebloc.proxyEnv, so it tests the real
+  path. As a test hook it never runs during install/upgrade, so it can never
+  block them or the hourly auto-upgrade. Set egressReachabilityCheck.enabled=false
+  to disable (e.g. a truly air-gapped cluster with no route to the backend).
+*/ -}}
+{{- $env := .Values.env.CLIENT_ENV | default "prod" -}}
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: {{ .Release.Name }}-egress-reachability-check
+  namespace: {{ .Release.Namespace }}
+  labels:
+    {{- include "tracebloc.labels" . | nindent 4 }}
+  annotations:
+    "helm.sh/hook": test
+    "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
+spec:
+  backoffLimit: 0
+  ttlSecondsAfterFinished: 120
+  template:
+    metadata:
+      labels:
+        {{- include "tracebloc.selectorLabels" . | nindent 8 }}
+    spec:
+      restartPolicy: Never
+      automountServiceAccountToken: false
+      securityContext:
+        runAsNonRoot: true
+        # curlimages/curl's default user is non-numeric (curl_user); runAsNonRoot
+        # can't verify that, so pin the image's uid explicitly.
+        runAsUser: 100
+        seccompProfile:
+          type: RuntimeDefault
+      containers:
+        - name: probe
+          image: {{ include "tracebloc.image" (dict "repository" "curlimages/curl" "tag" "8.20.0" "digest" "sha256:b3f1fb2a51d923260350d21b8654bbc607164a987e2f7c84a0ac199a67df812a" "registry" "docker.io") | quote }}
+          imagePullPolicy: IfNotPresent
+          securityContext:
+            allowPrivilegeEscalation: false
+            capabilities:
+              drop: ["ALL"]
+            readOnlyRootFilesystem: true
+          env:
+            {{- include "tracebloc.proxyEnv" . | nindent 12 }}
+            - name: CLIENT_ENV
+              value: {{ $env | quote }}
+          command:
+            - sh
+            - -c
+            - |
+              case "$CLIENT_ENV" in
+                dev) HOST="dev-api.tracebloc.io" ;;
+                stg) HOST="stg-api.tracebloc.io" ;;
+                *)   HOST="api.tracebloc.io" ;;
+              esac
+              echo "[egress-reachability-check] CLIENT_ENV=$CLIENT_ENV -> probing backend reachability to https://${HOST}/ (through the proxy if one is configured)..."
+              # Key the verdict on curl's EXIT CODE, not the HTTP status: we only care
+              # whether TCP+TLS to the backend can be established. Fail on proxy/DNS
+              # resolution (5/6), refused (7), timeout (28), broken TLS handshake (35)
+              # and a proxy rejecting CONNECT (56/97); any other code means it connected.
+              curl -sS -m 10 -o /dev/null "https://${HOST}/"; rc=$?
+              case "$rc" in
+                5)  echo "FAIL  could not resolve the configured proxy (curl exit 5) — check HTTP_PROXY settings."; exit 1 ;;
+                6)  echo "FAIL  could not resolve ${HOST} (curl exit 6) — in-cluster DNS or egress is broken."; exit 1 ;;
+                7)  echo "FAIL  connection to ${HOST}:443 refused (curl exit 7) — no egress route to the tracebloc backend."; exit 1 ;;
+                28) echo "FAIL  connection to ${HOST}:443 timed out (curl exit 28) — egress blocked by a firewall/proxy/NetworkPolicy. Experiments can't start without backend egress."; exit 1 ;;
+                35) echo "FAIL  TLS handshake with ${HOST}:443 failed (curl exit 35) — a firewall/proxy is interfering with TLS egress to the tracebloc backend."; exit 1 ;;
+                56|97) echo "FAIL  connection to ${HOST}:443 was rejected or reset (curl exit $rc) — the egress proxy denied the CONNECT tunnel or a firewall cut the connection."; exit 1 ;;
+              esac
+              echo "OK  backend reachable: connected to ${HOST}:443 (curl exit $rc) — the cluster can reach the tracebloc backend."
+              exit 0
+          resources:
+            requests:
+              cpu: "10m"
+              memory: "32Mi"
+            limits:
+              cpu: "100m"
+              memory: "64Mi"
+      {{- if include "tracebloc.useImagePullSecrets" . }}
+      imagePullSecrets:
+        - name: {{ include "tracebloc.registrySecretName" . }}
+      {{- end }}
+{{- end }}
diff --git a/client/tests/egress_reachability_check_test.yaml b/client/tests/egress_reachability_check_test.yaml
@@ -0,0 +1,64 @@
+suite: In-cluster backend-reachability check
+# Guards for the `helm test` reachability Job (client-runtime#116 WS3 /
+# cli#90), which verifies that a normal, non-training pod can reach the
+# tracebloc backend — the egress dependency that gates everything. Pinned
+# here: the on/off flag, the test-hook annotation (so the Job never runs
+# during install/upgrade), the absence of the training-workload label (so the
+# lockdown netpol never selects the probe and it keeps the jobs-manager /
+# requests-proxy egress class), and the CLIENT_ENV handed to the probe.
+templates:
+  - templates/egress-reachability-check.yaml
+set:
+  clientId: 'test-id'
+  clientPassword: 'test'
+tests:
+  # Enabled by default: a single document, of kind Job, marked as a test hook.
+  - it: renders a single helm-test Job by default
+    asserts:
+      - hasDocuments: {count: 1}
+      - isKind: {of: Job}
+      - equal:
+          path: metadata.annotations["helm.sh/hook"]
+          value: 'test'
+
+  # Switching the flag off removes the Job from the rendered output.
+  - it: does not render when disabled
+    set:
+      egressReachabilityCheck.enabled: false
+    asserts:
+      - hasDocuments: {count: 0}
+
+  # The training-egress lockdown selects pods by this label, so the probe pod
+  # template must not carry it.
+  - it: is NOT labelled as a training workload (keeps its own egress)
+    asserts:
+      - notExists:
+          path: spec.template.metadata.labels["tracebloc.io/workload"]
+
+  # Without an env.CLIENT_ENV override the probe receives CLIENT_ENV=prod.
+  - it: defaults the probe to the prod backend
+    asserts:
+      - contains:
+          path: spec.template.spec.containers[0].env
+          content: {name: CLIENT_ENV, value: prod}
+
+  # An env.CLIENT_ENV override is passed through as-is; the probe script
+  # picks the backend host from it at runtime.
+  - it: targets the dev backend when CLIENT_ENV=dev
+    set:
+      env.CLIENT_ENV: dev
+    asserts:
+      - contains:
+          path: spec.template.spec.containers[0].env
+          content: {name: CLIENT_ENV, value: dev}
+
+  # HTTP_PROXY_HOST/PORT values must surface as HTTPS_PROXY on the probe, so
+  # the check exercises the same proxied path as the real workloads.
+  - it: inherits the corporate proxy env so it tests the real egress path
+    set:
+      env.HTTP_PROXY_HOST: proxy.corp
+      env.HTTP_PROXY_PORT: '3128'
+    asserts:
+      - contains:
+          path: spec.template.spec.containers[0].env
+          content: {name: HTTPS_PROXY, value: 'http://proxy.corp:3128'}
@@ -205,6 +205,18 @@ networkPolicy:
       - "172.16.0.0/12"
       - "192.168.0.0/16"
 
+# -- In-cluster backend-reachability check (client-runtime#116 WS3 / cli#90).
+# `helm test <release>` runs a Job verifying that a normal (non-training) pod in
+# this namespace can reach the tracebloc backend API — the egress dependency
+# that gates everything: the cluster authenticates to the backend to obtain its
+# Service Bus credentials, so no backend egress => experiments sit in Pending.
+# It is the required-egress complement to the training egress-enforcement check
+# (networkPolicy.training.enforcementProbeHost), which verifies the opposite:
+# training pods are locked out. As a test hook it never runs on install/upgrade.
+# Set enabled=false on a truly air-gapped cluster with no route to the backend.
+egressReachabilityCheck:
+  enabled: true
+
 # -- Egress gateway (squid) — SECURITY §8.2 / client-runtime#102.
 # In-cluster forward proxy that lets a locked-down training pod reach an FQDN
 # allowlist (backend + App Insights) and nothing else. Labelled app=egress-proxy