tracebloc · saadqbal · Jun 23, 2026 · Jun 16, 2026 · Jun 17, 2026 · Jun 17, 2026
diff --git a/.github/workflows/helm-ci.yaml b/.github/workflows/helm-ci.yaml
@@ -16,8 +16,16 @@ on:
       - 'scripts/tests/e2e-auto-upgrade.sh'
       - '.github/workflows/helm-ci.yaml'
 
+concurrency:
+  # Cancel superseded runs on PRs — this is the repo's heaviest workflow
+  # (a real k3d cluster in upgrade-e2e plus two 4-platform matrices). Non-PR
+  # runs get a unique per-run group so they are never cancelled, even pending.
+  group: helm-ci-${{ github.event_name == 'pull_request' && github.ref || github.run_id }}
+  cancel-in-progress: ${{ github.event_name == 'pull_request' }}
+
 jobs:
   lint:
+    timeout-minutes: 10
     name: Lint
     runs-on: ubuntu-latest
     steps:
@@ -47,6 +55,7 @@ jobs:
         run: helm lint --strict ./ingestor
 
   template:
+    timeout-minutes: 10
     name: Template render
     runs-on: ubuntu-latest
     strategy:
@@ -81,6 +90,7 @@ jobs:
             /tmp/rendered-${{ matrix.platform }}.yaml
 
   unittest:
+    timeout-minutes: 10
     name: Unit tests
     runs-on: ubuntu-latest
     steps:
@@ -98,6 +108,7 @@ jobs:
         run: helm unittest ./client
 
   schema:
+    timeout-minutes: 10
     name: Schema validation
     runs-on: ubuntu-latest
     strategy:
@@ -119,6 +130,7 @@ jobs:
           echo "Schema validation passed for ${{ matrix.platform }}"
 
   ingestor-multiarch:
+    timeout-minutes: 10
     # Guard: the ingestor image the cluster spawns must be a multi-arch index
     # (linux/amd64 + linux/arm64), or arm64 hosts (Apple Silicon, Graviton)
     # fail data ingestion with "no match for platform" / ImagePullBackOff.
@@ -164,6 +176,7 @@ jobs:
           fi
 
   upgrade-e2e:
+    timeout-minutes: 30
     # Fleet auto-upgrade non-regression gate (client-runtime#102 / #245-class
     # regressions): installs the LAST PUBLISHED chart from gh-pages on a real
     # k3d cluster, then upgrades to THIS working tree via both

diff --git a/.github/workflows/installer-tests.yaml b/.github/workflows/installer-tests.yaml
@@ -28,8 +28,16 @@ on:
 permissions:
   contents: read
 
+concurrency:
+  # Cancel superseded PR runs — this suite is expensive (9-distro
+  # docker-in-docker matrix + real k3d e2e + Windows Pester). Non-PR runs get
+  # a unique per-run group so they are never cancelled, even while pending.
+  group: installer-tests-${{ github.event_name == 'pull_request' && github.ref || github.run_id }}
+  cancel-in-progress: ${{ github.event_name == 'pull_request' }}
+
 jobs:
   static:
+    timeout-minutes: 10
     name: Static analysis
     runs-on: ubuntu-latest
     steps:
@@ -73,6 +81,7 @@ jobs:
           Write-Host "no PSScriptAnalyzer errors"
 
   unit-bash:
+    timeout-minutes: 10
     name: bats (bash unit, mocked)
     runs-on: ubuntu-latest
     steps:
@@ -83,6 +92,7 @@ jobs:
         run: bats scripts/tests/*.bats
 
   unit-pester:
+    timeout-minutes: 20
     # Pester on Linux pwsh (fast) AND real Windows — the .ps1 installer's actual
     # target. fail-fast:false so a Windows-only surprise doesn't mask Linux signal.
     name: Pester (${{ matrix.os }})
@@ -108,6 +118,7 @@ jobs:
           Invoke-Pester -Configuration $cfg
 
   distro-prereqs:
+    timeout-minutes: 20
     # Runs the installer's REAL Linux prerequisite path in a fresh container for
     # each distro family. Proves the package-manager / Docker / conntrack / helm
     # branches all resolve and install — the layer where every installer bug we
@@ -142,6 +153,7 @@ jobs:
             bash scripts/tests/distro-prereqs.sh
 
   e2e-cluster:
+    timeout-minutes: 30
     # Highest-fidelity check CI can run: brings up an ACTUAL k3d cluster via the
     # installer's own create_cluster() on a real kernel (Docker is preinstalled
     # on the runner), proves it can schedule + run a public workload, then tears
@@ -160,6 +172,7 @@ jobs:
         run: bash scripts/tests/e2e-cluster.sh
 
   e2e-proxy:
+    timeout-minutes: 30
     # Authenticated corporate-proxy E2E (the Charité/hospital archetype): stands
     # up a squid that REQUIRES basic auth, brings the cluster up with the
     # installer's proxy config pointed at it as user:pass@host, and proves the

diff --git a/.github/workflows/public-pii-gate-caller.yml b/.github/workflows/public-pii-gate-caller.yml
@@ -0,0 +1,14 @@
+name: Public PII gate
+
+# Per-repo caller for the public-repo PII gate. Blocks PRs whose title/body/
+# commits contain a denylisted customer/partner name or known secret.
+# Logic lives in tracebloc/.github/.github/workflows/public-pii-gate.yml.
+
+on:
+  pull_request:
+    types: [opened, edited, reopened, synchronize, labeled, unlabeled]  # re-run on title/body edits, new commits, and label changes
+
+jobs:
+  pii-gate:
+    uses: tracebloc/.github/.github/workflows/public-pii-gate.yml@main  # NOTE(review): @main is a mutable ref — consider pinning to a commit SHA
+    secrets: inherit  # forwards all of this workflow's secrets to the called workflow
diff --git a/client/Chart.yaml b/client/Chart.yaml
@@ -2,8 +2,8 @@ apiVersion: v2
 name: client
 description: A unified Helm chart for tracebloc on AKS, EKS, bare-metal, and OpenShift
 type: application
-version: 1.7.1
-appVersion: "1.7.1"
+version: 1.8.0
+appVersion: "1.8.0"
 keywords:
   - tracebloc
   - kubernetes

@@ -67,6 +67,20 @@ client-pvc
 {{ .Values.pvc.data | default "50Gi" }}
 {{- end }}
 
+{{/*
+  Host directory (<base>/<release>/data) backing the DATASET (shared-images)
+  PV ONLY. <base> is hostPath.datasetPath, falling back to the historical
+  local /tracebloc so installs without a network dataset mount render
+  byte-identically. The installer passes hostPath.datasetPath=/tracebloc-data
+  when it bind-mounts a customer network (NFS) dir there (HOST_DATASET_DIR
+  set); mysql + logs ALWAYS stay on the local /tracebloc tree (InnoDB over
+  NFS is unsafe — backend#743). Nil-guarded for older `--reuse-values` upgrades.
+*/}}
+{{- define "tracebloc.clientDataHostPath" -}}
+{{- $base := (default dict .Values.hostPath).datasetPath | default "/tracebloc" -}}
+{{ printf "%s/%s/data" $base .Release.Name }}
+{{- end -}}
+
 {{- define "tracebloc.clientLogsPvc" -}}
 client-logs-pvc
 {{- end }}

@@ -0,0 +1,106 @@
+{{- if dig "enabled" true (default dict .Values.egressReachabilityCheck) }}
+{{- /*
+  In-cluster backend-reachability check (client-runtime#116 WS3 / cli#90).
+  A `helm test` hook: run `helm test <release>` to verify a normal (non-training)
+  pod in this namespace can reach the tracebloc backend API. This is the egress
+  dependency that gates EVERYTHING — the cluster authenticates to the backend to
+  obtain its Service Bus credentials, so if the backend is unreachable from inside
+  the cluster, experiments never start (they sit in Pending). It is the required-
+  egress complement to egress-enforcement-check, which verifies the opposite:
+  that *training* pods are locked OUT.
+
+  Service Bus itself is intentionally NOT probed here: its host is fetched
+  post-auth from the backend (static nowhere in the chart) and its egress is
+  brokered by the requests-proxy, whose readiness `tracebloc cluster doctor`
+  already checks. Reaching the backend is the prerequisite for both.
+
+  The probe pod is deliberately NOT labelled `tracebloc.io/workload: training`, so
+  the training-egress lockdown never selects it — it shares the egress class of
+  the jobs-manager / requests-proxy (the pods that actually reach the backend),
+  and honours the corporate proxy via tracebloc.proxyEnv, so it tests the real
+  path. As a test hook it never runs during install/upgrade, so it can never
+  block them or the hourly auto-upgrade. Set egressReachabilityCheck.enabled=false
+  to disable (e.g. a truly air-gapped cluster with no route to the backend).
+*/ -}}
+{{- $env := .Values.env.CLIENT_ENV | default "prod" -}}
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: {{ .Release.Name }}-egress-reachability-check
+  namespace: {{ .Release.Namespace }}
+  labels:
+    {{- include "tracebloc.labels" . | nindent 4 }}
+  annotations:
+    "helm.sh/hook": test  # runs only on `helm test`, never during install/upgrade
+    "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded  # a failed probe Job is not deleted by Helm until the next run
+spec:
+  backoffLimit: 0  # single attempt — a failed probe is not retried
+  ttlSecondsAfterFinished: 120  # finished Job becomes eligible for cleanup after 2 minutes
+  template:
+    metadata:
+      labels:  # NOTE(review): confirm no Service/workload selector in the chart matches on these labels alone
+        {{- include "tracebloc.selectorLabels" . | nindent 8 }}
+    spec:
+      restartPolicy: Never
+      automountServiceAccountToken: false  # the probe only runs curl; it makes no Kubernetes API calls
+      securityContext:
+        runAsNonRoot: true
+        # curlimages/curl's default user is non-numeric (curl_user); runAsNonRoot
+        # can't verify that, so pin the image's uid explicitly.
+        runAsUser: 100  # NOTE(review): a fixed uid may be rejected by OpenShift's restricted SCC uid-range check — confirm on that platform
+        seccompProfile:
+          type: RuntimeDefault
+      containers:
+        - name: probe
+          image: {{ include "tracebloc.image" (dict "repository" "curlimages/curl" "tag" "8.20.0" "digest" "sha256:b3f1fb2a51d923260350d21b8654bbc607164a987e2f7c84a0ac199a67df812a" "registry" "docker.io") | quote }}
+          imagePullPolicy: IfNotPresent
+          securityContext:
+            allowPrivilegeEscalation: false
+            capabilities:
+              drop: ["ALL"]
+            readOnlyRootFilesystem: true  # safe here: the probe sends curl's output to /dev/null
+          env:
+            {{- include "tracebloc.proxyEnv" . | nindent 12 }}
+            - name: CLIENT_ENV  # read by the probe script to pick the backend host
+              value: {{ $env | quote }}
+          command:
+            - sh
+            - -c
+            - |
+              case "$CLIENT_ENV" in
+                dev) HOST="dev-api.tracebloc.io" ;;
+                stg) HOST="stg-api.tracebloc.io" ;;
+                *)   HOST="api.tracebloc.io" ;;
+              esac
+              echo "[egress-reachability-check] CLIENT_ENV=$CLIENT_ENV -> probing backend reachability to https://${HOST}/ (through the proxy if one is configured)..."
+              # Key the verdict on curl's EXIT CODE, not the HTTP status. With no
+              # --fail, curl exits 0 for ANY HTTP response (200/401/404/5xx), so a
+              # 0 exit is the only outcome that proves a full TCP+TLS+HTTP round
+              # trip — the backend is genuinely reachable AND usable. Every
+              # non-zero code is a transport/TLS failure, so we fail closed on it:
+              # connect-refused (7) / timeout (28) = egress blocked; DNS/proxy
+              # resolve (6/5) = host can't be found; a TLS handshake/cert error
+              # (35/51/60/…) = reached the host but TLS is broken (commonly a proxy
+              # intercepting TLS with an untrusted CA), so the API is unusable.
+              curl -sS -m 10 -o /dev/null "https://${HOST}/"; rc=$?
+              case "$rc" in
+                0)  echo "OK  backend reachable: completed an HTTPS request to ${HOST}:443 (curl exit 0) — the cluster can reach the tracebloc backend."; exit 0 ;;
+                5)  echo "FAIL  could not resolve the configured proxy (curl exit 5) — check HTTP_PROXY settings."; exit 1 ;;
+                6)  echo "FAIL  could not resolve ${HOST} (curl exit 6) — in-cluster DNS or egress is broken."; exit 1 ;;
+                7)  echo "FAIL  connection to ${HOST}:443 refused (curl exit 7) — no egress route to the tracebloc backend."; exit 1 ;;
+                28) echo "FAIL  connection to ${HOST}:443 timed out (curl exit 28) — egress blocked by a firewall/proxy/NetworkPolicy. Experiments can't start without backend egress."; exit 1 ;;
+                35|51|53|58|59|60|66|77|83|91) echo "FAIL  reached ${HOST}:443 but the TLS handshake/certificate check failed (curl exit $rc) — commonly a corporate proxy intercepting TLS with a CA the cluster doesn't trust. The backend API is unusable until that CA is trusted; experiments can't authenticate."; exit 1 ;;
+                *)  echo "FAIL  could not complete an HTTPS request to ${HOST}:443 (curl exit $rc) — backend egress is not working."; exit 1 ;;
+              esac
+          resources:
+            requests:
+              cpu: "10m"
+              memory: "32Mi"
+            limits:
+              cpu: "100m"
+              memory: "64Mi"
+      {{- if include "tracebloc.useImagePullSecrets" . }}
+      imagePullSecrets:
+        - name: {{ include "tracebloc.registrySecretName" . }}
+      {{- end }}
+{{- end }}
@@ -17,7 +17,7 @@ spec:
     name: {{ $name }}
     namespace: {{ .Release.Namespace }}
   hostPath:
-    path: /tracebloc/{{ .Release.Name }}/data
+    path: {{ include "tracebloc.clientDataHostPath" . }}
     type: DirectoryOrCreate
 ---
 {{- end }}

diff --git a/client/tests/egress_reachability_check_test.yaml b/client/tests/egress_reachability_check_test.yaml
@@ -0,0 +1,78 @@
+suite: In-cluster backend-reachability check
+# client-runtime#116 WS3 / cli#90. A `helm test` Job that verifies a normal
+# (non-training) pod can reach the tracebloc backend — the egress dependency
+# that gates everything. These guards pin: the on/off flag, the test-hook
+# annotation (so it never runs during install/upgrade), that the probe is NOT
+# labelled as a training workload (so the lockdown netpol never selects it and
+# it keeps the jobs-manager/requests-proxy egress class), and that the probed
+# backend tracks CLIENT_ENV.
+templates:
+  - templates/egress-reachability-check.yaml
+set:
+  clientId: "test-id"
+  clientPassword: "test"
+tests:
+  - it: renders a single helm-test Job by default
+    asserts:
+      - hasDocuments:
+          count: 1
+      - isKind:
+          of: Job
+      - equal:
+          path: metadata.annotations["helm.sh/hook"]
+          value: test
+
+  - it: does not render when disabled
+    set:
+      egressReachabilityCheck.enabled: false
+    asserts:
+      - hasDocuments:
+          count: 0
+
+  - it: is NOT labelled as a training workload (keeps its own egress)
+    asserts:
+      - notExists:
+          path: spec.template.metadata.labels["tracebloc.io/workload"]
+
+  - it: defaults the probe to the prod backend
+    asserts:
+      - contains:
+          path: spec.template.spec.containers[0].env
+          content:
+            name: CLIENT_ENV
+            value: prod
+
+  - it: targets the dev backend when CLIENT_ENV=dev
+    set:
+      env.CLIENT_ENV: dev
+    asserts:
+      - contains:
+          path: spec.template.spec.containers[0].env
+          content:
+            name: CLIENT_ENV
+            value: dev
+
+  # The CLIENT_ENV -> host mapping lives in the probe's shell script, so pin
+  # it on the rendered command rather than only on the env var.
+  - it: maps CLIENT_ENV to the matching backend host in the probe script
+    asserts:
+      - matchRegex:
+          path: spec.template.spec.containers[0].command[2]
+          pattern: 'dev\) HOST="dev-api\.tracebloc\.io"'
+      - matchRegex:
+          path: spec.template.spec.containers[0].command[2]
+          pattern: 'stg\) HOST="stg-api\.tracebloc\.io"'
+      - matchRegex:
+          path: spec.template.spec.containers[0].command[2]
+          pattern: '\*\)\s+HOST="api\.tracebloc\.io"'
+
+  - it: inherits the corporate proxy env so it tests the real egress path
+    set:
+      env.HTTP_PROXY_HOST: proxy.corp
+      env.HTTP_PROXY_PORT: "3128"
+    asserts:
+      - contains:
+          path: spec.template.spec.containers[0].env
+          content:
+            name: HTTPS_PROXY
+            value: http://proxy.corp:3128
diff --git a/client/tests/jobs_manager_test.yaml b/client/tests/jobs_manager_test.yaml
@@ -183,3 +183,70 @@ tests:
             value: "false"
           count: 1
 
+  # tracebloc/backend#745: the per-spawned-training-job resource request/limit
+  # the jobs-manager hands to each Job it creates. The chart is the single
+  # effective source of truth — it ALWAYS injects RESOURCE_REQUESTS /
+  # RESOURCE_LIMITS, so client-runtime's jobs_manager.py only falls back to its
+  # own default when they are absent. Pin the rendered value here so the two
+  # cannot silently diverge again, on BOTH containers that receive it
+  # (api + pods-monitor).
+  - it: defaults spawned-job RESOURCE_REQUESTS/LIMITS to cpu=2,memory=8Gi (req==limit => Guaranteed QoS)
+    asserts:
+      - contains:
+          path: spec.template.spec.containers[0].env
+          content:
+            name: RESOURCE_REQUESTS
+            value: "cpu=2,memory=8Gi"
+          count: 1
+      - contains:
+          path: spec.template.spec.containers[0].env
+          content:
+            name: RESOURCE_LIMITS
+            value: "cpu=2,memory=8Gi"
+          count: 1
+      - contains:
+          path: spec.template.spec.containers[1].env
+          content:
+            name: RESOURCE_REQUESTS
+            value: "cpu=2,memory=8Gi"
+          count: 1
+      - contains:
+          path: spec.template.spec.containers[1].env
+          content:
+            name: RESOURCE_LIMITS
+            value: "cpu=2,memory=8Gi"
+          count: 1
+
+  # Overrides must reach BOTH containers too — asserting only the api
+  # container would let pods-monitor silently keep the default.
+  - it: lets operators override spawned-job resources via env.RESOURCE_REQUESTS/LIMITS
+    set:
+      env:
+        RESOURCE_REQUESTS: "cpu=4,memory=16Gi"
+        RESOURCE_LIMITS: "cpu=4,memory=16Gi"
+    asserts:
+      - contains:
+          path: spec.template.spec.containers[0].env
+          content:
+            name: RESOURCE_REQUESTS
+            value: "cpu=4,memory=16Gi"
+          count: 1
+      - contains:
+          path: spec.template.spec.containers[0].env
+          content:
+            name: RESOURCE_LIMITS
+            value: "cpu=4,memory=16Gi"
+          count: 1
+      - contains:
+          path: spec.template.spec.containers[1].env
+          content:
+            name: RESOURCE_REQUESTS
+            value: "cpu=4,memory=16Gi"
+          count: 1
+      - contains:
+          path: spec.template.spec.containers[1].env
+          content:
+            name: RESOURCE_LIMITS
+            value: "cpu=4,memory=16Gi"
+          count: 1
+