Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions .github/workflows/helm-ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,16 @@ on:
- 'scripts/tests/e2e-auto-upgrade.sh'
- '.github/workflows/helm-ci.yaml'

concurrency:
  # Cancel superseded runs on PRs — this is the repo's heaviest workflow
  # (a real k3d cluster in upgrade-e2e plus two 4-platform matrices).
  #
  # Push/schedule runs must never be cancelled. `cancel-in-progress: false`
  # alone does not guarantee that: runs sharing a group still queue behind
  # each other, and a newly queued run cancels any run that is already
  # PENDING in the same group. Non-PR runs therefore get a group that is
  # unique per run (run_id), which keeps them out of any shared queue; only
  # pull_request runs share a per-ref group.
  group: helm-ci-${{ github.event_name == 'pull_request' && github.ref || github.run_id }}
  cancel-in-progress: ${{ github.event_name == 'pull_request' }}

jobs:
lint:
timeout-minutes: 10
name: Lint
runs-on: ubuntu-latest
steps:
Expand Down Expand Up @@ -47,6 +55,7 @@ jobs:
run: helm lint --strict ./ingestor

template:
timeout-minutes: 10
name: Template render
runs-on: ubuntu-latest
strategy:
Expand Down Expand Up @@ -81,6 +90,7 @@ jobs:
/tmp/rendered-${{ matrix.platform }}.yaml

unittest:
timeout-minutes: 10
name: Unit tests
runs-on: ubuntu-latest
steps:
Expand All @@ -98,6 +108,7 @@ jobs:
run: helm unittest ./client

schema:
timeout-minutes: 10
name: Schema validation
runs-on: ubuntu-latest
strategy:
Expand All @@ -119,6 +130,7 @@ jobs:
echo "Schema validation passed for ${{ matrix.platform }}"

ingestor-multiarch:
timeout-minutes: 10
# Guard: the ingestor image the cluster spawns must be a multi-arch index
# (linux/amd64 + linux/arm64), or arm64 hosts (Apple Silicon, Graviton)
# fail data ingestion with "no match for platform" / ImagePullBackOff.
Expand Down Expand Up @@ -164,6 +176,7 @@ jobs:
fi

upgrade-e2e:
timeout-minutes: 30
# Fleet auto-upgrade non-regression gate (client-runtime#102 / #245-class
# regressions): installs the LAST PUBLISHED chart from gh-pages on a real
# k3d cluster, then upgrades to THIS working tree via both
Expand Down
13 changes: 13 additions & 0 deletions .github/workflows/installer-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,16 @@ on:
permissions:
contents: read

concurrency:
  # Cancel superseded PR runs — this suite is expensive (9-distro
  # docker-in-docker matrix + real k3d e2e + Windows Pester).
  #
  # Push/schedule runs must never be cancelled. `cancel-in-progress: false`
  # alone does not guarantee that: runs sharing a group still queue behind
  # each other, and a newly queued run cancels any run that is already
  # PENDING in the same group. Non-PR runs therefore get a group that is
  # unique per run (run_id), which keeps them out of any shared queue; only
  # pull_request runs share a per-ref group.
  group: installer-tests-${{ github.event_name == 'pull_request' && github.ref || github.run_id }}
  cancel-in-progress: ${{ github.event_name == 'pull_request' }}

jobs:
static:
timeout-minutes: 10
name: Static analysis
runs-on: ubuntu-latest
steps:
Expand Down Expand Up @@ -73,6 +81,7 @@ jobs:
Write-Host "no PSScriptAnalyzer errors"

unit-bash:
timeout-minutes: 10
name: bats (bash unit, mocked)
runs-on: ubuntu-latest
steps:
Expand All @@ -83,6 +92,7 @@ jobs:
run: bats scripts/tests/*.bats

unit-pester:
timeout-minutes: 20
# Pester on Linux pwsh (fast) AND real Windows — the .ps1 installer's actual
# target. fail-fast:false so a Windows-only surprise doesn't mask Linux signal.
name: Pester (${{ matrix.os }})
Expand All @@ -108,6 +118,7 @@ jobs:
Invoke-Pester -Configuration $cfg

distro-prereqs:
timeout-minutes: 20
# Runs the installer's REAL Linux prerequisite path in a fresh container for
# each distro family. Proves the package-manager / Docker / conntrack / helm
# branches all resolve and install — the layer where every installer bug we
Expand Down Expand Up @@ -142,6 +153,7 @@ jobs:
bash scripts/tests/distro-prereqs.sh

e2e-cluster:
timeout-minutes: 30
# Highest-fidelity check CI can run: brings up an ACTUAL k3d cluster via the
# installer's own create_cluster() on a real kernel (Docker is preinstalled
# on the runner), proves it can schedule + run a public workload, then tears
Expand All @@ -160,6 +172,7 @@ jobs:
run: bash scripts/tests/e2e-cluster.sh

e2e-proxy:
timeout-minutes: 30
# Authenticated corporate-proxy E2E (the Charité/hospital archetype): stands
# up a squid that REQUIRES basic auth, brings the cluster up with the
# installer's proxy config pointed at it as user:pass@host, and proves the
Expand Down
14 changes: 14 additions & 0 deletions .github/workflows/public-pii-gate-caller.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
name: Public PII gate

# Thin per-repo entry point for the public-repo PII gate. It blocks PRs whose
# title, body, or commits contain a denylisted customer/partner name or a
# known secret. No logic lives in this file: the checks are implemented in
# the reusable workflow tracebloc/.github/.github/workflows/public-pii-gate.yml,
# which this workflow calls with the repository's secrets passed through.

on:
  pull_request:
    types:
      - opened
      - edited
      - reopened
      - synchronize
      - labeled
      - unlabeled

jobs:
  pii-gate:
    uses: tracebloc/.github/.github/workflows/public-pii-gate.yml@main
    secrets: inherit
4 changes: 2 additions & 2 deletions client/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ apiVersion: v2
name: client
description: A unified Helm chart for tracebloc on AKS, EKS, bare-metal, and OpenShift
type: application
version: 1.7.1
appVersion: "1.7.1"
version: 1.8.0
appVersion: "1.8.0"
keywords:
- tracebloc
- kubernetes
Expand Down
14 changes: 14 additions & 0 deletions client/templates/_helpers.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,20 @@ client-pvc
{{ .Values.pvc.data | default "50Gi" }}
{{- end }}

{{/*
hostPath base for the DATASET (shared-images) PV ONLY. Defaults to the
historical local path /tracebloc so installs without a network dataset mount
render byte-identically. When the installer bind-mounts a customer network
(NFS) dir at /tracebloc-data (HOST_DATASET_DIR set), it passes
hostPath.datasetPath=/tracebloc-data to relocate datasets onto that mount,
while mysql + logs ALWAYS stay on the local /tracebloc tree (InnoDB over NFS
is unsafe — backend#743). The /<release>/data suffix is appended here.
Nil-guarded (default dict) for `--reuse-values` upgrades predating this key.
*/}}
{{- define "tracebloc.clientDataHostPath" -}}
{{- /* Fall back to an empty dict so a missing .Values.hostPath cannot nil-fail. */ -}}
{{- $hostPathValues := .Values.hostPath | default dict -}}
{{- /* Base directory for the dataset PV; /tracebloc unless datasetPath is set. */ -}}
{{- $datasetBase := $hostPathValues.datasetPath | default "/tracebloc" -}}
{{ printf "%s/%s/data" $datasetBase .Release.Name }}
{{- end -}}

{{- define "tracebloc.clientLogsPvc" -}}
client-logs-pvc
{{- end }}
Expand Down
106 changes: 106 additions & 0 deletions client/templates/egress-reachability-check.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
{{- if dig "enabled" true (default dict .Values.egressReachabilityCheck) }}
{{- /*
In-cluster backend-reachability check (client-runtime#116 WS3 / cli#90).
A `helm test` hook: run `helm test <release>` to verify a normal (non-training)
pod in this namespace can reach the tracebloc backend API. This is the egress
dependency that gates EVERYTHING — the cluster authenticates to the backend to
obtain its Service Bus credentials, so if the backend is unreachable from inside
the cluster, experiments never start (they sit in Pending). It is the required-
egress complement to egress-enforcement-check, which verifies the opposite:
that *training* pods are locked OUT.

Service Bus itself is intentionally NOT probed here: its host is fetched
post-auth from the backend (static nowhere in the chart) and its egress is
brokered by the requests-proxy, whose readiness `tracebloc cluster doctor`
already checks. Reaching the backend is the prerequisite for both.

The probe pod is deliberately NOT labelled `tracebloc.io/workload: training`, so
the training-egress lockdown never selects it — it shares the egress class of
the jobs-manager / requests-proxy (the pods that actually reach the backend),
and honours the corporate proxy via tracebloc.proxyEnv, so it tests the real
path. As a test hook it never runs during install/upgrade, so it can never
block them or the hourly auto-upgrade. Set egressReachabilityCheck.enabled=false
to disable (e.g. a truly air-gapped cluster with no route to the backend).
*/ -}}
{{- $env := .Values.env.CLIENT_ENV | default "prod" -}}
apiVersion: batch/v1
kind: Job
metadata:
name: {{ .Release.Name }}-egress-reachability-check
namespace: {{ .Release.Namespace }}
labels:
{{- include "tracebloc.labels" . | nindent 4 }}
annotations:
# Test hook: the Job is created by `helm test`, not by install/upgrade.
"helm.sh/hook": test
# Delete a leftover Job before each new run, and delete the Job again once
# a run succeeds.
"helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
spec:
# No retries: together with restartPolicy Never below, one failed probe
# fails the Job.
backoffLimit: 0
# A finished Job becomes eligible for automatic cleanup after 120 seconds.
ttlSecondsAfterFinished: 120
template:
metadata:
labels:
{{- include "tracebloc.selectorLabels" . | nindent 8 }}
spec:
restartPolicy: Never
# The probe only makes an outbound HTTPS request, so no service-account
# token is mounted into the pod.
automountServiceAccountToken: false
securityContext:
runAsNonRoot: true
# curlimages/curl's default user is non-numeric (curl_user); runAsNonRoot
# can't verify that, so pin the image's uid explicitly.
runAsUser: 100
seccompProfile:
type: RuntimeDefault
containers:
- name: probe
# Image reference is pinned by both tag and digest.
image: {{ include "tracebloc.image" (dict "repository" "curlimages/curl" "tag" "8.20.0" "digest" "sha256:b3f1fb2a51d923260350d21b8654bbc607164a987e2f7c84a0ac199a67df812a" "registry" "docker.io") | quote }}
imagePullPolicy: IfNotPresent
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop: ["ALL"]
# The script writes the response body to /dev/null only, so the root
# filesystem can stay read-only.
readOnlyRootFilesystem: true
env:
{{- include "tracebloc.proxyEnv" . | nindent 12 }}
# CLIENT_ENV (default "prod") selects which backend host the script probes.
- name: CLIENT_ENV
value: {{ $env | quote }}
command:
- sh
- -c
- |
case "$CLIENT_ENV" in
dev) HOST="dev-api.tracebloc.io" ;;
stg) HOST="stg-api.tracebloc.io" ;;
*) HOST="api.tracebloc.io" ;;
esac
echo "[egress-reachability-check] CLIENT_ENV=$CLIENT_ENV -> probing backend reachability to https://${HOST}/ (through the proxy if one is configured)..."
# Key the verdict on curl's EXIT CODE, not the HTTP status. With no
# --fail, curl exits 0 for ANY HTTP response (200/401/404/5xx), so a
# 0 exit is the only outcome that proves a full TCP+TLS+HTTP round
# trip — the backend is genuinely reachable AND usable. Every
# non-zero code is a transport/TLS failure, so we fail closed on it:
# connect-refused (7) / timeout (28) = egress blocked; DNS/proxy
# resolve (6/5) = host can't be found; a TLS handshake/cert error
# (35/51/60/…) = reached the host but TLS is broken (commonly a proxy
# intercepting TLS with an untrusted CA), so the API is unusable.
curl -sS -m 10 -o /dev/null "https://${HOST}/"; rc=$?
case "$rc" in
0) echo "OK backend reachable: completed an HTTPS request to ${HOST}:443 (curl exit 0) — the cluster can reach the tracebloc backend."; exit 0 ;;
5) echo "FAIL could not resolve the configured proxy (curl exit 5) — check HTTP_PROXY settings."; exit 1 ;;
6) echo "FAIL could not resolve ${HOST} (curl exit 6) — in-cluster DNS or egress is broken."; exit 1 ;;
7) echo "FAIL connection to ${HOST}:443 refused (curl exit 7) — no egress route to the tracebloc backend."; exit 1 ;;
28) echo "FAIL connection to ${HOST}:443 timed out (curl exit 28) — egress blocked by a firewall/proxy/NetworkPolicy. Experiments can't start without backend egress."; exit 1 ;;
35|51|53|58|59|60|66|77|83|91) echo "FAIL reached ${HOST}:443 but the TLS handshake/certificate check failed (curl exit $rc) — commonly a corporate proxy intercepting TLS with a CA the cluster doesn't trust. The backend API is unusable until that CA is trusted; experiments can't authenticate."; exit 1 ;;
*) echo "FAIL could not complete an HTTPS request to ${HOST}:443 (curl exit $rc) — backend egress is not working."; exit 1 ;;
esac
resources:
requests:
cpu: "10m"
memory: "32Mi"
limits:
cpu: "100m"
memory: "64Mi"
# Pull secret is attached only when the tracebloc.useImagePullSecrets helper
# renders a non-empty value.
{{- if include "tracebloc.useImagePullSecrets" . }}
imagePullSecrets:
- name: {{ include "tracebloc.registrySecretName" . }}
{{- end }}
{{- end }}
2 changes: 1 addition & 1 deletion client/templates/shared-images-pvc.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ spec:
name: {{ $name }}
namespace: {{ .Release.Namespace }}
hostPath:
path: /tracebloc/{{ .Release.Name }}/data
path: {{ include "tracebloc.clientDataHostPath" . }}
type: DirectoryOrCreate
---
{{- end }}
Expand Down
64 changes: 64 additions & 0 deletions client/tests/egress_reachability_check_test.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
suite: In-cluster backend-reachability check
# client-runtime#116 WS3 / cli#90. A `helm test` Job that verifies a normal
# (non-training) pod can reach the tracebloc backend — the egress dependency
# that gates everything. These guards pin: the on/off flag, the test-hook
# annotation (so it never runs during install/upgrade), that the probe is NOT
# labelled as a training workload (so the lockdown netpol never selects it and
# it keeps the jobs-manager/requests-proxy egress class), and that the probed
# backend tracks CLIENT_ENV.
templates:
- templates/egress-reachability-check.yaml
# Suite-level values: applied to every test below, with per-test `set`
# entries layered on top.
set:
clientId: "test-id"
clientPassword: "test"
tests:
# No egressReachabilityCheck value is set here, so this pins the default-on
# behaviour and the test-hook annotation.
- it: renders a single helm-test Job by default
asserts:
- hasDocuments:
count: 1
- isKind:
of: Job
- equal:
path: metadata.annotations["helm.sh/hook"]
value: test

# The opt-out flag must suppress the template entirely (zero documents).
- it: does not render when disabled
set:
egressReachabilityCheck.enabled: false
asserts:
- hasDocuments:
count: 0

# Checks the pod-template labels, which is where a workload label would sit.
- it: is NOT labelled as a training workload (keeps its own egress)
asserts:
- notExists:
path: spec.template.metadata.labels["tracebloc.io/workload"]

# With env.CLIENT_ENV unset, the container must receive CLIENT_ENV=prod.
- it: defaults the probe to the prod backend
asserts:
- contains:
path: spec.template.spec.containers[0].env
content:
name: CLIENT_ENV
value: prod

# Pins the CLIENT_ENV env var handed to the container; the host the script
# picks from it is not asserted here.
- it: targets the dev backend when CLIENT_ENV=dev
set:
env.CLIENT_ENV: dev
asserts:
- contains:
path: spec.template.spec.containers[0].env
content:
name: CLIENT_ENV
value: dev

# Proxy host + port values must surface as an HTTPS_PROXY URL on the probe.
- it: inherits the corporate proxy env so it tests the real egress path
set:
env.HTTP_PROXY_HOST: proxy.corp
env.HTTP_PROXY_PORT: "3128"
asserts:
- contains:
path: spec.template.spec.containers[0].env
content:
name: HTTPS_PROXY
value: http://proxy.corp:3128
53 changes: 53 additions & 0 deletions client/tests/jobs_manager_test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -183,3 +183,56 @@ tests:
value: "false"
count: 1

# tracebloc/backend#745: the per-spawned-training-job resource request/limit
# the jobs-manager hands to each Job it creates. The chart is the single
# effective source of truth — it ALWAYS injects RESOURCE_REQUESTS /
# RESOURCE_LIMITS, so client-runtime's jobs_manager.py only falls back to its
# own default when they are absent. Pin the rendered value here so the two
# cannot silently diverge again, on BOTH containers that receive it
# (api + pods-monitor).
# No values are overridden in this test, so it pins the chart defaults.
# `count: 1` additionally requires each variable to appear exactly once in
# the container's env list.
- it: defaults spawned-job RESOURCE_REQUESTS/LIMITS to cpu=2,memory=8Gi (req==limit => Guaranteed QoS)
asserts:
# First container (containers[0]).
- contains:
path: spec.template.spec.containers[0].env
content:
name: RESOURCE_REQUESTS
value: "cpu=2,memory=8Gi"
count: 1
- contains:
path: spec.template.spec.containers[0].env
content:
name: RESOURCE_LIMITS
value: "cpu=2,memory=8Gi"
count: 1
# Second container (containers[1]) must carry the same pair.
- contains:
path: spec.template.spec.containers[1].env
content:
name: RESOURCE_REQUESTS
value: "cpu=2,memory=8Gi"
count: 1
- contains:
path: spec.template.spec.containers[1].env
content:
name: RESOURCE_LIMITS
value: "cpu=2,memory=8Gi"
count: 1

# Operator overrides must reach BOTH containers that receive these variables
# (containers[0] and containers[1] — the same pair the default-value test
# pins). Asserting only the first would let the second keep the chart default
# without any test noticing.
# NOTE(review): assumes the second container reads the same env.* values as
# the first — confirm against the jobs-manager template.
- it: lets operators override spawned-job resources via env.RESOURCE_REQUESTS/LIMITS
  set:
    env:
      RESOURCE_REQUESTS: "cpu=4,memory=16Gi"
      RESOURCE_LIMITS: "cpu=4,memory=16Gi"
  asserts:
    - contains:
        path: spec.template.spec.containers[0].env
        content:
          name: RESOURCE_REQUESTS
          value: "cpu=4,memory=16Gi"
        count: 1
    - contains:
        path: spec.template.spec.containers[0].env
        content:
          name: RESOURCE_LIMITS
          value: "cpu=4,memory=16Gi"
        count: 1
    - contains:
        path: spec.template.spec.containers[1].env
        content:
          name: RESOURCE_REQUESTS
          value: "cpu=4,memory=16Gi"
        count: 1
    - contains:
        path: spec.template.spec.containers[1].env
        content:
          name: RESOURCE_LIMITS
          value: "cpu=4,memory=16Gi"
        count: 1

Loading
Loading