Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
102 changes: 102 additions & 0 deletions client/templates/egress-reachability-check.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
{{- if dig "enabled" true (default dict .Values.egressReachabilityCheck) }}
{{- /*
In-cluster backend-reachability check (client-runtime#116 WS3 / cli#90).
A `helm test` hook: run `helm test <release>` to verify a normal (non-training)
pod in this namespace can reach the tracebloc backend API. This is the egress
dependency that gates EVERYTHING — the cluster authenticates to the backend to
obtain its Service Bus credentials, so if the backend is unreachable from inside
the cluster, experiments never start (they sit in Pending). It is the required-
egress complement to egress-enforcement-check, which verifies the opposite:
that *training* pods are locked OUT.

Service Bus itself is intentionally NOT probed here: its host is fetched
post-auth from the backend (static nowhere in the chart) and its egress is
brokered by the requests-proxy, whose readiness `tracebloc cluster doctor`
already checks. Reaching the backend is the prerequisite for both.

The probe pod is deliberately NOT labelled `tracebloc.io/workload: training`, so
the training-egress lockdown never selects it — it shares the egress class of
the jobs-manager / requests-proxy (the pods that actually reach the backend),
and honours the corporate proxy via tracebloc.proxyEnv, so it tests the real
path. As a test hook it never runs during install/upgrade, so it can never
block them or the hourly auto-upgrade. Set egressReachabilityCheck.enabled=false
to disable (e.g. a truly air-gapped cluster with no route to the backend).

Verdict: the probe keys on curl's exit code, not the HTTP status. Exits 5, 6, 7
and 28 (proxy/host resolution, connect refused, timeout) and 56 / 97 (tunnel
through the proxy rejected or connection cut off, proxy handshake error) fail
the test; any other exit code is treated as "backend reachable".
*/ -}}
{{- $env := .Values.env.CLIENT_ENV | default "prod" -}}
apiVersion: batch/v1
kind: Job
metadata:
  name: {{ .Release.Name }}-egress-reachability-check
  namespace: {{ .Release.Namespace }}
  labels:
    {{- include "tracebloc.labels" . | nindent 4 }}
  annotations:
    # Test hook only: runs on `helm test`, never on install/upgrade.
    "helm.sh/hook": test
    "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
spec:
  # One attempt only: a failed probe is the answer, retrying just delays it.
  backoffLimit: 0
  ttlSecondsAfterFinished: 120
  template:
    metadata:
      labels:
        {{- include "tracebloc.selectorLabels" . | nindent 8 }}
    spec:
      restartPolicy: Never
      automountServiceAccountToken: false
      securityContext:
        runAsNonRoot: true
        # curlimages/curl's default user is non-numeric (curl_user); runAsNonRoot
        # can't verify that, so pin the image's uid explicitly.
        runAsUser: 100
        seccompProfile:
          type: RuntimeDefault
      containers:
        - name: probe
          image: {{ include "tracebloc.image" (dict "repository" "curlimages/curl" "tag" "8.20.0" "digest" "sha256:b3f1fb2a51d923260350d21b8654bbc607164a987e2f7c84a0ac199a67df812a" "registry" "docker.io") | quote }}
          imagePullPolicy: IfNotPresent
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop: ["ALL"]
            readOnlyRootFilesystem: true
          env:
            {{- include "tracebloc.proxyEnv" . | nindent 12 }}
            - name: CLIENT_ENV
              value: {{ $env | quote }}
          command:
            - sh
            - -c
            - |
              case "$CLIENT_ENV" in
                dev) HOST="dev-api.tracebloc.io" ;;
                stg) HOST="stg-api.tracebloc.io" ;;
                *)   HOST="api.tracebloc.io" ;;
              esac
              echo "[egress-reachability-check] CLIENT_ENV=$CLIENT_ENV -> probing backend reachability to https://${HOST}/ (through the proxy if one is configured)..."
              # Key the verdict on curl's EXIT CODE, not the HTTP status: we only
              # care whether a connection to the backend can be made.
              # No-egress yields connect-refused (7) or timeout (28); a DNS/proxy
              # resolution failure (6/5) means the host can't be found.
              # When a proxy is configured, curl's TCP connect targets the PROXY,
              # so a proxy that refuses the CONNECT tunnel (403/407/502) surfaces
              # as a receive failure (56), and a proxy handshake error as 97 —
              # neither reached the backend, so both must fail the test too.
              # Any other outcome (0, or a TLS/HTTP-layer code) means the
              # connection already succeeded => the backend is reachable.
              curl -sS -m 10 -o /dev/null "https://${HOST}/"; rc=$?
              case "$rc" in
                5)  echo "FAIL could not resolve the configured proxy (curl exit 5) — check HTTP_PROXY settings."; exit 1 ;;
                6)  echo "FAIL could not resolve ${HOST} (curl exit 6) — in-cluster DNS or egress is broken."; exit 1 ;;
                7)  echo "FAIL connection to ${HOST}:443 refused (curl exit 7) — no egress route to the tracebloc backend."; exit 1 ;;
                28) echo "FAIL connection to ${HOST}:443 timed out (curl exit 28) — egress blocked by a firewall/proxy/NetworkPolicy. Experiments can't start without backend egress."; exit 1 ;;
                56) echo "FAIL connection to ${HOST}:443 was rejected or cut off before a response arrived (curl exit 56) — typically the proxy refused the CONNECT tunnel (403/407/502). Allow ${HOST}:443 on the proxy."; exit 1 ;;
                97) echo "FAIL proxy handshake error while tunnelling to ${HOST}:443 (curl exit 97) — check the proxy settings and its allowlist."; exit 1 ;;
              esac
              echo "OK backend reachable: connected to ${HOST}:443 (curl exit $rc) — the cluster can reach the tracebloc backend."
              exit 0
          resources:
            requests:
              cpu: "10m"
              memory: "32Mi"
            limits:
              cpu: "100m"
              memory: "64Mi"
      {{- if include "tracebloc.useImagePullSecrets" . }}
      imagePullSecrets:
        - name: {{ include "tracebloc.registrySecretName" . }}
      {{- end }}
{{- end }}
64 changes: 64 additions & 0 deletions client/tests/egress_reachability_check_test.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
suite: In-cluster backend-reachability check
# client-runtime#116 WS3 / cli#90.
# Guards for the `helm test` Job that proves a normal (non-training) pod can
# reach the tracebloc backend. The cases below pin, in order:
#   * the enabled/disabled switch,
#   * the test-hook annotation (so the Job never runs on install/upgrade),
#   * the absence of the training-workload label on the probe pod (so the
#     lockdown netpol never selects it),
#   * the CLIENT_ENV value handed to the probe container,
#   * the proxy env passed through to the probe container.
templates:
  - templates/egress-reachability-check.yaml
set:
  clientId: "test-id"
  clientPassword: "test"
tests:
  # Default values: exactly one document, a Job, annotated as a test hook.
  - it: renders a single helm-test Job by default
    asserts:
      - hasDocuments:
          count: 1
      - isKind:
          of: Job
      - equal:
          path: 'metadata.annotations["helm.sh/hook"]'
          value: "test"

  # Opt-out flag removes the Job entirely.
  - it: does not render when disabled
    set:
      egressReachabilityCheck.enabled: false
    asserts:
      - hasDocuments:
          count: 0

  # The pod template must not carry the tracebloc.io/workload label.
  - it: is NOT labelled as a training workload (keeps its own egress)
    asserts:
      - notExists:
          path: 'spec.template.metadata.labels["tracebloc.io/workload"]'

  # With no env override the container receives CLIENT_ENV=prod.
  - it: defaults the probe to the prod backend
    asserts:
      - contains:
          path: spec.template.spec.containers[0].env
          content:
            name: "CLIENT_ENV"
            value: "prod"

  # An explicit CLIENT_ENV is forwarded to the container unchanged.
  - it: targets the dev backend when CLIENT_ENV=dev
    set:
      env.CLIENT_ENV: "dev"
    asserts:
      - contains:
          path: spec.template.spec.containers[0].env
          content:
            name: "CLIENT_ENV"
            value: "dev"

  # Proxy host/port values must surface as HTTPS_PROXY on the probe container.
  - it: inherits the corporate proxy env so it tests the real egress path
    set:
      env.HTTP_PROXY_HOST: "proxy.corp"
      env.HTTP_PROXY_PORT: "3128"
    asserts:
      - contains:
          path: spec.template.spec.containers[0].env
          content:
            name: "HTTPS_PROXY"
            value: "http://proxy.corp:3128"
12 changes: 12 additions & 0 deletions client/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,18 @@ networkPolicy:
- "172.16.0.0/12"
- "192.168.0.0/16"

# -- In-cluster backend-reachability check (client-runtime#116 WS3 / cli#90).
# Renders a `helm test` Job that checks whether a normal (non-training) pod in
# this namespace can reach the tracebloc backend API. Everything depends on
# that egress: the cluster authenticates to the backend to obtain its Service
# Bus credentials, so without backend egress experiments sit in Pending.
# This is the required-egress counterpart of the training-egress enforcement
# check (see networkPolicy.training.enforcementProbeHost), which verifies the
# opposite: that training pods are locked out.
# The Job is a test hook, so it never runs during install/upgrade.
# Set enabled=false on a truly air-gapped cluster with no route to the backend.
egressReachabilityCheck:
  enabled: true

# -- Egress gateway (squid) — SECURITY §8.2 / client-runtime#102.
# In-cluster forward proxy that lets a locked-down training pod reach an FQDN
# allowlist (backend + App Insights) and nothing else. Labelled app=egress-proxy
Expand Down
Loading