From c9a5e8e885ef2d4bfa84db11a6c0111d1f9d3892 Mon Sep 17 00:00:00 2001 From: lukasWuttke <54042461+LukasWodka@users.noreply.github.com> Date: Thu, 30 Apr 2026 14:54:16 +0200 Subject: [PATCH 01/24] Merge pull request #88 from tracebloc/ci/add-wip-limit-caller ci: add WIP-limit-check caller workflow --- .github/workflows/wip-limit-check.yml | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 .github/workflows/wip-limit-check.yml diff --git a/.github/workflows/wip-limit-check.yml b/.github/workflows/wip-limit-check.yml new file mode 100644 index 0000000..8c027df --- /dev/null +++ b/.github/workflows/wip-limit-check.yml @@ -0,0 +1,10 @@ +name: WIP limit check + +on: + pull_request: + types: [opened, ready_for_review] + +jobs: + check: + uses: tracebloc/.github/.github/workflows/wip-limit-check.yml@main + secrets: inherit From 567d41ce488f806b374b090e72300d9b98094829 Mon Sep 17 00:00:00 2001 From: Syed Is Saqlain Date: Tue, 5 May 2026 13:07:30 +0530 Subject: [PATCH 02/24] feat(requests-proxy): register requests-proxy in Helm chart (#95) * feat(requests-proxy): register requests-proxy in Helm chart - Add requests-proxy Deployment and Service templates - Auto-generate requests-proxy-admin token on first install (preserved across upgrades via lookup; override with requestsProxyAdminToken) - Inject REQUESTS_PROXY_ADMIN_TOKEN into jobs-manager via the same secret - Add images.requestsProxy and resources.requestsProxy values Co-Authored-By: Claude Sonnet 4.6 * Update order of setting request proxy admin token * Bugbot Fix YAML * Bugbot fix add validation for request proxy --------- Co-authored-by: Syed Saqlain Co-authored-by: Claude Sonnet 4.6 --- client/templates/jobs-manager-deployment.yaml | 5 ++ .../templates/requests-proxy-deployment.yaml | 72 +++++++++++++++++++ client/templates/requests-proxy-service.yaml | 17 +++++ client/templates/secrets.yaml | 21 ++++++ client/tests/node_agents_namespace_test.yaml | 18 ++--- client/tests/secrets_test.yaml | 36 
++++++++++ client/values.schema.json | 32 +++++++++ client/values.yaml | 16 +++++ 8 files changed, 209 insertions(+), 8 deletions(-) create mode 100644 client/templates/requests-proxy-deployment.yaml create mode 100644 client/templates/requests-proxy-service.yaml diff --git a/client/templates/jobs-manager-deployment.yaml b/client/templates/jobs-manager-deployment.yaml index cf02e20..f60c55f 100644 --- a/client/templates/jobs-manager-deployment.yaml +++ b/client/templates/jobs-manager-deployment.yaml @@ -62,6 +62,11 @@ spec: secretKeyRef: name: {{ include "tracebloc.secretName" . }} key: CLIENT_PASSWORD + - name: REQUESTS_PROXY_ADMIN_TOKEN + valueFrom: + secretKeyRef: + name: {{ .Release.Name }}-requests-proxy-admin + key: token - name: CLIENT_PVC value: {{ include "tracebloc.clientDataPvc" . | quote }} - name: CLIENT_LOGS_PVC diff --git a/client/templates/requests-proxy-deployment.yaml b/client/templates/requests-proxy-deployment.yaml new file mode 100644 index 0000000..d749445 --- /dev/null +++ b/client/templates/requests-proxy-deployment.yaml @@ -0,0 +1,72 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ .Release.Name }}-requests-proxy + namespace: {{ .Release.Namespace }} + labels: + {{- include "tracebloc.labels" . 
| nindent 4 }} +spec: + replicas: 1 + selector: + matchLabels: + app: requests-proxy + template: + metadata: + labels: + app: requests-proxy + spec: + automountServiceAccountToken: false + securityContext: + runAsNonRoot: true + seccompProfile: + type: RuntimeDefault + containers: + - name: proxy + image: {{ include "tracebloc.image" (dict "repository" "tracebloc/jobs-manager" "tag" .Values.env.CLIENT_ENV "digest" .Values.images.requestsProxy.digest "registry" "docker.io") | quote }} + imagePullPolicy: {{ if .Values.images.requestsProxy.digest }}IfNotPresent{{ else }}Always{{ end }} + workingDir: /app + command: ["python", "-m", "gunicorn"] + args: + - "--bind=0.0.0.0:8888" + # The proxy stores registered pod tokens in process-local memory, + # so it must run as a single worker unless the registry moves to + # shared storage. + - "--workers=1" + - "--worker-tmp-dir=/dev/shm" + - "--threads=4" + - "--graceful-timeout=30" + - "--timeout=60" + - "--access-logfile=-" + - "--error-logfile=-" + - "requests_proxy_server:create_app()" + ports: + - containerPort: 8888 + securityContext: + runAsNonRoot: true + runAsUser: 1001 + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + readOnlyRootFilesystem: true + resources: + requests: + cpu: {{ .Values.resources.requestsProxy.requests.cpu | default "100m" | quote }} + memory: {{ .Values.resources.requestsProxy.requests.memory | default "256Mi" | quote }} + limits: + cpu: {{ .Values.resources.requestsProxy.limits.cpu | default "500m" | quote }} + memory: {{ .Values.resources.requestsProxy.limits.memory | default "512Mi" | quote }} + env: + - name: REQUESTS_PROXY_ADMIN_TOKEN + valueFrom: + secretKeyRef: + name: {{ .Release.Name }}-requests-proxy-admin + key: token + - name: EXPERIMENTS_QUEUE_NAME + value: "experiments" + - name: FLOPS_QUEUE_NAME + value: "flops" + {{- if include "tracebloc.useImagePullSecrets" . }} + imagePullSecrets: + - name: {{ include "tracebloc.registrySecretName" . 
}} + {{- end }} + restartPolicy: Always diff --git a/client/templates/requests-proxy-service.yaml b/client/templates/requests-proxy-service.yaml new file mode 100644 index 0000000..22cfe8f --- /dev/null +++ b/client/templates/requests-proxy-service.yaml @@ -0,0 +1,17 @@ +apiVersion: v1 +kind: Service +metadata: + name: requests-proxy-service + namespace: {{ .Release.Namespace }} + labels: + {{- include "tracebloc.labels" . | nindent 4 }} + app: requests-proxy +spec: + selector: + app: requests-proxy + ports: + - name: http + port: 8888 + targetPort: 8888 + protocol: TCP + type: ClusterIP diff --git a/client/templates/secrets.yaml b/client/templates/secrets.yaml index 65242b4..620a5e7 100644 --- a/client/templates/secrets.yaml +++ b/client/templates/secrets.yaml @@ -13,6 +13,27 @@ type: Opaque data: CLIENT_ID: {{ $clientId | b64enc | quote }} CLIENT_PASSWORD: {{ $clientPassword | b64enc | quote }} +--- +{{ $proxySecretName := printf "%s-requests-proxy-admin" .Release.Name -}} +{{- $existingProxySecret := lookup "v1" "Secret" .Release.Namespace $proxySecretName -}} +{{- $proxyAdminToken := "" -}} +{{- if .Values.requestsProxyAdminToken -}} +{{- $proxyAdminToken = .Values.requestsProxyAdminToken -}} +{{- else if and $existingProxySecret $existingProxySecret.data (index $existingProxySecret.data "token") -}} +{{- $proxyAdminToken = index $existingProxySecret.data "token" | b64dec -}} +{{- else -}} +{{- $proxyAdminToken = randAlphaNum 64 -}} +{{- end -}} +apiVersion: v1 +kind: Secret +metadata: + name: {{ $proxySecretName }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "tracebloc.labels" . 
| nindent 4 }} +type: Opaque +data: + token: {{ $proxyAdminToken | b64enc | quote }} {{- if and (ne .Values.resourceMonitor false) (ne .Values.nodeAgents.namespace.name .Release.Namespace) }} --- # Mirrored into the node-agents namespace so the resource-monitor DaemonSet diff --git a/client/tests/node_agents_namespace_test.yaml b/client/tests/node_agents_namespace_test.yaml index 1fe1f7a..aea6431 100644 --- a/client/tests/node_agents_namespace_test.yaml +++ b/client/tests/node_agents_namespace_test.yaml @@ -186,7 +186,7 @@ tests: template: templates/secrets.yaml asserts: - hasDocuments: - count: 2 + count: 3 - isKind: of: Secret documentIndex: 0 @@ -196,21 +196,21 @@ tests: documentIndex: 0 - isKind: of: Secret - documentIndex: 1 + documentIndex: 2 - equal: path: metadata.namespace value: tracebloc-node-agents - documentIndex: 1 + documentIndex: 2 - equal: path: metadata.name value: RELEASE-NAME-secrets - documentIndex: 1 + documentIndex: 2 - isNotEmpty: path: data.CLIENT_ID - documentIndex: 1 + documentIndex: 2 - isNotEmpty: path: data.CLIENT_PASSWORD - documentIndex: 1 + documentIndex: 2 - it: should not mirror the tracebloc Secret when resourceMonitor is disabled template: templates/secrets.yaml @@ -218,10 +218,11 @@ tests: resourceMonitor: false asserts: - hasDocuments: - count: 1 + count: 2 - equal: path: metadata.namespace value: tracebloc-templates + documentIndex: 0 - it: should not mirror the tracebloc Secret when node-agents namespace equals release namespace template: templates/secrets.yaml @@ -232,10 +233,11 @@ tests: name: tracebloc-templates asserts: - hasDocuments: - count: 1 + count: 2 - equal: path: metadata.namespace value: tracebloc-templates + documentIndex: 0 - it: should mirror the docker registry Secret into the node-agents namespace when in use template: templates/docker-registry-secret.yaml diff --git a/client/tests/secrets_test.yaml b/client/tests/secrets_test.yaml index 3feab58..0505893 100644 --- a/client/tests/secrets_test.yaml +++ 
b/client/tests/secrets_test.yaml @@ -11,19 +11,55 @@ tests: asserts: - isKind: of: Secret + documentIndex: 0 - equal: path: metadata.name value: RELEASE-NAME-secrets + documentIndex: 0 - equal: path: type value: Opaque + documentIndex: 0 - isNotEmpty: path: data.CLIENT_ID + documentIndex: 0 - isNotEmpty: path: data.CLIENT_PASSWORD + documentIndex: 0 - matchRegex: path: metadata.labels["app.kubernetes.io/managed-by"] pattern: Helm + documentIndex: 0 + + - it: should use explicit requests-proxy admin token override + template: templates/secrets.yaml + set: + clientId: "my-client-id" + clientPassword: "my-secret-pass" + requestsProxyAdminToken: "override-token" + asserts: + - equal: + path: metadata.name + value: RELEASE-NAME-requests-proxy-admin + documentIndex: 1 + - equal: + path: data.token + value: b3ZlcnJpZGUtdG9rZW4= + documentIndex: 1 + + - it: should generate requests-proxy admin token when no override is provided + template: templates/secrets.yaml + set: + clientId: "my-client-id" + clientPassword: "my-secret-pass" + asserts: + - equal: + path: metadata.name + value: RELEASE-NAME-requests-proxy-admin + documentIndex: 1 + - isNotEmpty: + path: data.token + documentIndex: 1 - it: should create docker registry secret when create is true template: templates/docker-registry-secret.yaml diff --git a/client/values.schema.json b/client/values.schema.json index 67d39b2..709f7c1 100644 --- a/client/values.schema.json +++ b/client/values.schema.json @@ -284,6 +284,15 @@ } } }, + "requestsProxy": { + "type": "object", + "properties": { + "digest": { + "type": "string", + "pattern": "^(sha256:[a-f0-9]{64})?$" + } + } + }, "mysqlClient": { "type": "object", "properties": { @@ -374,6 +383,25 @@ } } } + }, + "requestsProxy": { + "type": "object", + "properties": { + "requests": { + "type": "object", + "properties": { + "cpu": { "type": "string", "pattern": "^[0-9]+m?$" }, + "memory": { "type": "string", "pattern": "^[0-9]+(Ki|Mi|Gi|Ti)$" } + } + }, + "limits": { + "type": 
"object", + "properties": { + "cpu": { "type": "string", "pattern": "^[0-9]+m?$" }, + "memory": { "type": "string", "pattern": "^[0-9]+(Ki|Mi|Gi|Ti)$" } + } + } + } } } }, @@ -428,6 +456,10 @@ "not": { "pattern": "^<.*>$" }, "description": "Client authentication password. Must be a real value, not a placeholder like ." }, + "requestsProxyAdminToken": { + "type": "string", + "description": "Optional admin token override for the requests-proxy service." + }, "autoUpgrade": { "type": "object", "description": "Self-upgrade CronJob (issue tracebloc/client#69). Polls the helm repo at autoUpgrade.repoUrl daily and runs `helm upgrade --reuse-values` when a newer chart version is published. Disable to keep the release pinned to its install-time chart version.", diff --git a/client/values.yaml b/client/values.yaml index ca70c63..2e6c40d 100644 --- a/client/values.yaml +++ b/client/values.yaml @@ -183,6 +183,8 @@ images: digest: "" resourceMonitor: digest: "" + requestsProxy: + digest: "" mysqlClient: # mysql-client is only published under the "prod" tag — it has no dev/ # staging variants, so we decouple it from env.CLIENT_ENV. Override only @@ -230,6 +232,13 @@ resources: limits: cpu: "500m" memory: "512Mi" + requestsProxy: + requests: + cpu: "100m" + memory: "256Mi" + limits: + cpu: "500m" + memory: "512Mi" # -- PriorityClass for the data-plane (mysql). # Cluster-scoped resource. Created with helm.sh/resource-policy: keep so a @@ -267,6 +276,13 @@ podDisruptionBudget: clientId: "" clientPassword: "" +# -- Admin token for the requests-proxy service. +# Optional. If set, this value overrides any existing requests-proxy admin +# secret on install or upgrade. When left empty, the chart reuses the current +# secret if present or generates a new token on first install. +# Do NOT commit real tokens to version control. 
+requestsProxyAdminToken: "" + # -- Docker registry credentials (optional; only used when dockerRegistry is set and create is true) # Omit dockerRegistry entirely, or set create: false, for public images (no imagePullSecrets). # When create is true, secret name is {{ .Release.Name }}-regcred. From 0977535df581819eabebd924d4c488b4f9317283 Mon Sep 17 00:00:00 2001 From: "Asad Iqbal (Saadi)" Date: Wed, 6 May 2026 18:29:39 +0500 Subject: [PATCH 03/24] Merge pull request #106 from tracebloc/docs/drop-stale-helm-charts-refs-105 docs: drop stale tracebloc-helm-charts references in INSTALL.md --- README.md | 47 ++++++++++++++++++++++++++++++++++++++++------- docs/INSTALL.md | 13 ++++++++----- 2 files changed, 48 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index e7f207c..787c113 100644 --- a/README.md +++ b/README.md @@ -50,18 +50,51 @@ For the threat model, defense layers, per-platform caveats, operator responsibil ## Deploy +This repo ships the **tracebloc** unified Helm chart (currently `v1.3.1`) — one chart for AKS, EKS, bare-metal, and OpenShift. + +### Quick install + +A single command provisions a Kubernetes cluster, auto-detects and installs GPU drivers (NVIDIA or AMD), and deploys the tracebloc client. Use this when you don't already have a cluster — the result is a full client install, not a demo. + +**macOS / Linux** + +```bash +bash <(curl -fsSL https://tracebloc.io/i.sh) +``` + +**Windows** *(PowerShell as Administrator)* + +```powershell +irm https://tracebloc.io/i.ps1 | iex +``` + +The installer pulls helper scripts from this repo at runtime — see [`scripts/install-k8s.sh`](scripts/install-k8s.sh) and [`scripts/install-k8s.ps1`](scripts/install-k8s.ps1). 
+ +### Helm install + +For existing Kubernetes clusters: + ```bash -docker pull tracebloc/client:latest +helm repo add tracebloc https://tracebloc.github.io/client +helm repo update +helm install my-tracebloc tracebloc/tracebloc \ + --namespace tracebloc --create-namespace \ + -f my-values.yaml ``` -Deployment varies by infrastructure. Follow the guide for your setup: +Full deployment guide → **[docs/INSTALL.md](docs/INSTALL.md)** (prerequisites, required values, upgrade & rollback, air-gapped install). + +| Topic | Where to look | +|---|---| +| Production install + required values | [docs/INSTALL.md](docs/INSTALL.md) | +| Threat model & operator responsibilities | [docs/SECURITY.md](docs/SECURITY.md) | +| Migrating from `eks-1.0.x` / `aks-*` charts to `client-1.x` | [docs/MIGRATIONS.md](docs/MIGRATIONS.md) | +| Per-tenant migration runbook | [docs/migration-tools/README.md](docs/migration-tools/README.md) | +| Per-platform value mapping | [client/MIGRATION.md](client/MIGRATION.md) | -- [Deployment overview](https://docs.tracebloc.io/environment-setup/deployment-overview) -- [Local — Linux](https://docs.tracebloc.io/environment-setup/local-linux) -- [Local — macOS](https://docs.tracebloc.io/environment-setup/local-macos) -- [AWS](https://docs.tracebloc.io/environment-setup/aws) +Platform-specific walkthroughs: [Linux](https://docs.tracebloc.io/environment-setup/local-deployment-guide-linux) · [macOS](https://docs.tracebloc.io/environment-setup/local-deployment-guide-macos) · [EKS](https://docs.tracebloc.io/environment-setup/eks-client-deployment-guide) · [Azure / AKS](https://docs.tracebloc.io/environment-setup/azure-deployment-guide) -Full documentation → [docs.tracebloc.io](https://docs.tracebloc.io/) +> **NetworkPolicy required.** The chart's training-pod egress lockdown only takes effect on a CNI that enforces NetworkPolicy. See [SECURITY.md § Per-platform caveats](docs/SECURITY.md#5-per-platform-caveats). 
## Links diff --git a/docs/INSTALL.md b/docs/INSTALL.md index 151d900..81690c1 100644 --- a/docs/INSTALL.md +++ b/docs/INSTALL.md @@ -2,6 +2,13 @@ This guide covers installing the **tracebloc** unified Helm chart (AKS, EKS, bare-metal, OpenShift) in a production-ready way. +> **Don't have a Kubernetes cluster yet?** The standalone installer provisions a cluster, installs GPU drivers, and deploys a full tracebloc client in a single command: +> +> - **macOS / Linux:** `bash <(curl -fsSL https://tracebloc.io/i.sh)` +> - **Windows:** `irm https://tracebloc.io/i.ps1 | iex` *(PowerShell as Administrator)* +> +> See the [README's Quick install section](../README.md#quick-install) for what it does. Continue here if you're deploying into an existing cluster. + --- ## Prerequisites @@ -200,7 +207,7 @@ The chart repository used for installation is **[tracebloc/client](https://githu To make the chart available via `helm repo add tracebloc https://tracebloc.github.io/client`: -1. **In the repo that hosts the chart (e.g. tracebloc/client or tracebloc-helm-charts):** +1. **In the tracebloc/client repo:** Enable **GitHub Pages** → **Settings** → **Pages** → **Source**: branch `gh-pages` (root). 2. **Create a release or push a tag** @@ -217,8 +224,6 @@ To make the chart available via `helm repo add tracebloc https://tracebloc.githu 4. **First time only:** ensure the `gh-pages` branch exists. The workflow creates it if missing. -5. **If you develop in a different repo** (e.g. tracebloc-helm-charts): run the release workflow there to build the chart, then copy the generated `tracebloc-.tgz` and updated `index.yaml` into the **tracebloc/client** repo’s `gh-pages` branch so the chart is served at `https://tracebloc.github.io/client`. 
- After that, users can run: ```bash @@ -226,8 +231,6 @@ helm repo add tracebloc https://tracebloc.github.io/client helm install my-tracebloc tracebloc/tracebloc -n tracebloc -f my-values.yaml ``` -**Note:** If the chart is developed in a different repo (e.g. `tracebloc-helm-charts`), run the release workflow there to produce the `.tgz` and `index.yaml`, then copy the packaged chart and updated index into the `tracebloc/client` repo’s `gh-pages` branch (or run the same release workflow from the client repo) so the chart is served at `https://tracebloc.github.io/client`. - --- ## Pre-install checklist (production) From 111be3ea10bdd63c4decd765da2aab76f7ad0b3f Mon Sep 17 00:00:00 2001 From: lukasWuttke <54042461+LukasWodka@users.noreply.github.com> Date: Wed, 6 May 2026 17:40:02 +0200 Subject: [PATCH 04/24] ci: add FR-pass comment caller for multi-stage kanban flow --- .github/workflows/fr-pass-comment-caller.yml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 .github/workflows/fr-pass-comment-caller.yml diff --git a/.github/workflows/fr-pass-comment-caller.yml b/.github/workflows/fr-pass-comment-caller.yml new file mode 100644 index 0000000..80192e7 --- /dev/null +++ b/.github/workflows/fr-pass-comment-caller.yml @@ -0,0 +1,14 @@ +name: FR pass comment + +# Per-repo caller. Listens for /fr-pass comments and advances kanban items +# from "FR on dev" → "Ready for staging" or "FR on staging" → "Ready for prod". +# All logic lives in tracebloc/.github/.github/workflows/fr-pass-comment.yml. 
+ +on: + issue_comment: + types: [created] + +jobs: + advance: + uses: tracebloc/.github/.github/workflows/fr-pass-comment.yml@main + secrets: inherit From b69c5f6badb872b5f4ab2350a3d7ccb892d9ba9d Mon Sep 17 00:00:00 2001 From: lukasWuttke <54042461+LukasWodka@users.noreply.github.com> Date: Wed, 6 May 2026 17:40:03 +0200 Subject: [PATCH 05/24] ci: add FR gate caller for staging/main promotions --- .github/workflows/fr-gate-caller.yml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 .github/workflows/fr-gate-caller.yml diff --git a/.github/workflows/fr-gate-caller.yml b/.github/workflows/fr-gate-caller.yml new file mode 100644 index 0000000..b725e2c --- /dev/null +++ b/.github/workflows/fr-gate-caller.yml @@ -0,0 +1,15 @@ +name: FR gate + +# Per-repo caller. Blocks merges to staging/main/master unless every contained +# kanban item is in "Ready for staging" or "Ready for prod" respectively. +# All logic lives in tracebloc/.github/.github/workflows/fr-gate.yml. + +on: + pull_request: + branches: [staging, main, master] + types: [opened, reopened, synchronize, ready_for_review, labeled, unlabeled] + +jobs: + gate: + uses: tracebloc/.github/.github/workflows/fr-gate.yml@main + secrets: inherit From 50b9c897e37f35c8098c5bbfa876a10eff9fb32d Mon Sep 17 00:00:00 2001 From: "Asad Iqbal (Saadi)" Date: Thu, 7 May 2026 15:12:35 +0500 Subject: [PATCH 06/24] =?UTF-8?q?chore:=20sync=20main=20=E2=86=92=20develo?= =?UTF-8?q?p=20after=20misrouted=20docs=20PRs=20(#108)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * docs: fix README Deploy section (Helm not docker), surface in-repo docs The Deploy section opened with `docker pull tracebloc/client:latest`, but this repo ships a Helm chart — the actual install is `helm install`. External walkthrough URLs (`/local-linux`, `/local-macos`, `/aws`, `/deployment-overview`) didn't match any path in the tracebloc/docs tree, so they 404. 
The in-repo documentation (`docs/INSTALL.md`, `docs/MIGRATIONS.md`, `docs/migration-tools/README.md`, `client/MIGRATION.md`) was never linked from the README despite being the operational source of truth. Surgical change — the rest of the README stays as-is: - Replace `docker pull` with `helm repo add` + `helm install` (matches docs/INSTALL.md) - Call out chart version (v1.3.1) and platform support (AKS / EKS / bare-metal / OpenShift) up front - Table linking every in-repo operational doc - Fix external URLs to match actual tracebloc/docs paths (local-deployment-guide-linux, local-deployment-guide-macos, eks-client-deployment-guide, azure-deployment-guide) - Pull NetworkPolicy/CNI prerequisite into a callout Closes #101 Co-Authored-By: Claude Opus 4.7 (1M context) * docs: surface standalone installer in README and INSTALL.md The standalone installer (bash <(curl -fsSL tracebloc.io/i.sh) / irm tracebloc.io/i.ps1 | iex) is the one-command path for evaluation, local dev, and first-time installs — it provisions a cluster, detects GPU drivers, and deploys the client. Today it isn't documented anywhere reachable from this repo, so readers see the multi-step helm install flow as the only option. README: - New "Quick install" subsection at the top of Deploy with macOS/Linux and Windows commands, brief description of what it does, and a pointer to the local helper scripts under scripts/ - Existing helm flow relabeled as "Helm install (production)" — now positioned as the option for existing production clusters docs/INSTALL.md: - Top-of-doc callout pointing at the standalone installer for non-production users - Production-focused content untouched Closes #103 Co-Authored-By: Claude Opus 4.7 (1M context) * docs: reframe Quick install — same client, different cluster path Previous wording ("Best for evaluation, local dev, and first-time installs" / "Just trying it out? For local dev or a quick evaluation") implied the standalone installer produces a lesser/demo client. 
It doesn't — it produces the same full client, just on a cluster the script provisions for you. Reframes the differentiator around cluster ownership instead of install quality: - README: "Use this when you don't already have a cluster — the result is a full client install, not a demo." Helm subsection retitled from "Helm install (production)" to just "Helm install" with "For existing Kubernetes clusters". - INSTALL.md: callout opens with "Don't have a Kubernetes cluster yet?" and emphasizes "a full tracebloc client". Refs #103 * docs: explicit https:// on installer URLs (security) curl and PowerShell's irm both default to HTTP when no scheme is specified, so `curl -fsSL tracebloc.io/i.sh` and `irm tracebloc.io/i.ps1` issue plaintext requests. The downloaded body is piped straight into bash / iex, so a network-level attacker between the user and tracebloc.io could MITM the response and inject arbitrary code. Add explicit `https://` to every installer URL in README.md and docs/INSTALL.md so the request is encrypted from the first byte. Refs #103 * ci: bootstrap FR-pass caller on main * ci: bootstrap FR gate caller on main --------- Co-authored-by: Lukas Wuttke Co-authored-by: Claude Opus 4.7 (1M context) Co-authored-by: lukasWuttke <54042461+LukasWodka@users.noreply.github.com> From bb1ad5506b7fc0333241978ff6f0abab871d7bf6 Mon Sep 17 00:00:00 2001 From: "Asad Iqbal (Saadi)" Date: Thu, 7 May 2026 15:28:14 +0500 Subject: [PATCH 07/24] chore(auto-upgrade): run cronjob hourly at :23 (#112) Switches the auto-upgrade CronJob default schedule from "23 2 * * *" (daily 02:23 UTC) to "23 * * * *" (hourly at :23). 
Co-authored-by: Claude Opus 4.7 (1M context) --- client/tests/auto_upgrade_test.yaml | 2 +- client/values.yaml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/client/tests/auto_upgrade_test.yaml b/client/tests/auto_upgrade_test.yaml index 7af68bd..57d36bb 100644 --- a/client/tests/auto_upgrade_test.yaml +++ b/client/tests/auto_upgrade_test.yaml @@ -77,7 +77,7 @@ tests: value: stg-auto-upgrade - equal: path: spec.schedule - value: "23 2 * * *" + value: "23 * * * *" - equal: path: spec.concurrencyPolicy value: Forbid diff --git a/client/values.yaml b/client/values.yaml index 2e6c40d..dab4804 100644 --- a/client/values.yaml +++ b/client/values.yaml @@ -325,9 +325,9 @@ autoUpgrade: # Default ON: the entire point of #69 is that customers freeze on the # version they installed. Defaulting off would recreate that exact bug. enabled: true - # Daily at 02:23 UTC. The off-hour minute spreads load across the + # Hourly at :23. The off-hour minute spreads load across the # tracebloc.github.io/client GitHub Pages origin. - schedule: "23 2 * * *" + schedule: "23 * * * *" # Helm chart repo to poll. Override only when mirroring internally. 
repoUrl: "https://tracebloc.github.io/client" repoName: "tracebloc" From 19b8158ea78a827fd9c436358d47702c3f7b4905 Mon Sep 17 00:00:00 2001 From: "Asad Iqbal (Saadi)" Date: Thu, 7 May 2026 16:18:43 +0500 Subject: [PATCH 08/24] Merge pull request #115 from tracebloc/chore/bump-chart-1.3.2-develop chore(client): bump chart 1.3.1 -> 1.3.2 (develop sync) --- client/Chart.yaml | 4 ++-- client/MIGRATION.md | 7 ++++++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/client/Chart.yaml b/client/Chart.yaml index afbcdcf..23fecef 100644 --- a/client/Chart.yaml +++ b/client/Chart.yaml @@ -2,8 +2,8 @@ apiVersion: v2 name: client description: A unified Helm chart for tracebloc on AKS, EKS, bare-metal, and OpenShift type: application -version: 1.3.1 -appVersion: "1.3.1" +version: 1.3.2 +appVersion: "1.3.2" keywords: - tracebloc - kubernetes diff --git a/client/MIGRATION.md b/client/MIGRATION.md index 90e3a90..c507eb6 100644 --- a/client/MIGRATION.md +++ b/client/MIGRATION.md @@ -5,11 +5,16 @@ This guide explains how to migrate from the legacy per-platform charts (`aks/`, ## Upgrading to 1.3.0 — self-upgrade CronJob lands on by default Releases of 1.3.0+ install a `-auto-upgrade` CronJob that polls -`https://tracebloc.github.io/client` daily and runs +`https://tracebloc.github.io/client` and runs `helm upgrade --reset-then-reuse-values` when a newer chart version is published. This closes [tracebloc/client#69](https://github.com/tracebloc/client/issues/69) — older deployed clients stop drifting from the latest secure / stable release. +The default cadence is **hourly at :23 UTC** as of 1.3.2 (was daily at 02:23 +UTC in 1.3.0 / 1.3.1). The off-hour minute spreads load across the +`tracebloc.github.io/client` GitHub Pages origin. Operators who want a +different schedule can override `autoUpgrade.schedule`. 
+ > **Verified end-to-end on `tb-client-dev-templates` during the 1.3.1 release**: > a `tracebloc` release at 1.3.0 self-upgraded to 1.3.1 within a single > CronJob tick after publish, with no operator intervention. From 578bff9f1371347aad9ff869bd1fc22764fc171e Mon Sep 17 00:00:00 2001 From: "Asad Iqbal (Saadi)" Date: Thu, 7 May 2026 17:34:31 +0500 Subject: [PATCH 09/24] ci: drop push-tags trigger from release-helm-chart workflow (#117) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * ci: drop push-tags trigger from release-helm-chart workflow `gh release create v` (the established release path per `gh release list`) fires both `push` (tag) and `release` (published) events, which causes two parallel workflow runs to race for the gh-pages push. The slower run fails with non-fast-forward. Most recent example: v1.3.2 cut today — run 25492826437 (release event) failed; run 25492826350 (push event) succeeded. Artifacts landed fine, but the failed sibling shows up as a red X on the release and is noise for anyone debugging future releases. Keeping only `release: published` removes the race. The `Upload chart to GitHub Release (on tag)` step's `startsWith(github.ref, 'refs/tags/')` guard still evaluates true for release events (`github.ref` is the tag ref), so the upload step behaviour is preserved. Closes #116 * ci: harden release-asset upload against actions/runner#2788 With the push-tags trigger removed, the upload step's `if: startsWith(github.ref, 'refs/tags/')` guard is the only thing keeping the upload from running, but it silently evaluates to false when `github.ref` arrives empty — a known intermittent runner bug (actions/runner#2788, still open as of 2026-05). The same bug also affects `github.ref_name`, which softprops/action-gh-release@v2 uses by default to derive the tag, so the action itself can target the wrong release (or fail) when the bug fires. 
Drop the now-redundant `if:` guard (the workflow only runs on `release: published`, so every run is by definition a release event) and pass `tag_name` explicitly from the release event payload, which is unaffected by the bug. * ci: pin checkout ref to release tag (actions/runner#2788 hardening) actions/checkout@v4 defaults `ref` to github.ref, which is the same field hit by actions/runner#2788 — the still-open intermittent bug where github.ref arrives empty on release-triggered runs. Per the action's docs, when "checking out the repository that triggered a workflow, this defaults to the reference or SHA for that event. Otherwise, uses the default branch." So an empty github.ref would fall back to the repo default branch (develop here), and we'd package the chart from develop's HEAD instead of the tagged commit. Pin ref explicitly to github.event.release.tag_name, which is part of the release event payload and is unaffected by the runner bug. --- .github/workflows/release-helm-chart.yaml | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/.github/workflows/release-helm-chart.yaml b/.github/workflows/release-helm-chart.yaml index 5593c24..4bc015b 100644 --- a/.github/workflows/release-helm-chart.yaml +++ b/.github/workflows/release-helm-chart.yaml @@ -6,10 +6,6 @@ name: Release Helm Chart on: - push: - tags: - - 'client-v*' - - 'v*' release: types: [published] @@ -26,8 +22,14 @@ jobs: id-token: write steps: - name: Checkout + # Pin ref to the release tag rather than letting checkout default to + # github.ref, which intermittently arrives empty on release-triggered + # runs (actions/runner#2788, still open). An empty default would cause + # checkout to fall back to the repo default branch and package the + # chart from the wrong commit. 
uses: actions/checkout@v4 with: + ref: ${{ github.event.release.tag_name }} fetch-depth: 0 - name: Set up Helm @@ -101,10 +103,13 @@ jobs: git push origin gh-pages fi - - name: Upload chart to GitHub Release (on tag) - if: startsWith(github.ref, 'refs/tags/') + - name: Upload chart to GitHub Release + # Pin tag_name from the release event payload rather than relying on + # github.ref / github.ref_name, which intermittently arrive empty on + # release-triggered runs (actions/runner#2788, still open). uses: softprops/action-gh-release@v2 with: + tag_name: ${{ github.event.release.tag_name }} files: client-*.tgz generate_release_notes: true env: From 9195f604a057f4bb09f092bbe62186b2627c5e3c Mon Sep 17 00:00:00 2001 From: Syed Is Saqlain Date: Tue, 12 May 2026 14:24:35 +0530 Subject: [PATCH 10/24] Add MySQL Host to request proxy yaml file (#118) Co-authored-by: Syed Saqlain --- client/templates/requests-proxy-deployment.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/client/templates/requests-proxy-deployment.yaml b/client/templates/requests-proxy-deployment.yaml index d749445..3423137 100644 --- a/client/templates/requests-proxy-deployment.yaml +++ b/client/templates/requests-proxy-deployment.yaml @@ -61,6 +61,8 @@ spec: secretKeyRef: name: {{ .Release.Name }}-requests-proxy-admin key: token + - name: MYSQL_HOST + value: "mysql-client" - name: EXPERIMENTS_QUEUE_NAME value: "experiments" - name: FLOPS_QUEUE_NAME From af26953646d0060e23b9d1e42e66f97b2ca63eec Mon Sep 17 00:00:00 2001 From: Syed Is Saqlain Date: Tue, 12 May 2026 18:15:46 +0530 Subject: [PATCH 11/24] Add request proxy url to jobs manager yaml file (#119) Co-authored-by: Syed Saqlain --- client/templates/jobs-manager-deployment.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/client/templates/jobs-manager-deployment.yaml b/client/templates/jobs-manager-deployment.yaml index f60c55f..a9d5da8 100644 --- a/client/templates/jobs-manager-deployment.yaml +++ 
b/client/templates/jobs-manager-deployment.yaml @@ -73,6 +73,8 @@ spec: value: {{ include "tracebloc.clientLogsPvc" . | quote }} - name: MYSQL_HOST value: "mysql-client" + - name: REQUESTS_PROXY_URL + value: "http://requests-proxy-service:8888" - name: JOB_IMAGE_HOST value: "docker.io/" - name: CLIENT_ENV From 0fe120e6b033be68ebfbc6499d74dc5ef836d526 Mon Sep 17 00:00:00 2001 From: Syed Is Saqlain Date: Wed, 13 May 2026 14:56:32 +0530 Subject: [PATCH 12/24] Remove REQUESTS_PROXY_ADMIN_TOKEN (#120) Co-authored-by: Syed Saqlain --- client/templates/jobs-manager-deployment.yaml | 5 ---- .../templates/requests-proxy-deployment.yaml | 5 ---- client/templates/secrets.yaml | 21 ------------- client/tests/node_agents_namespace_test.yaml | 16 +++++----- client/tests/secrets_test.yaml | 30 ------------------- client/values.schema.json | 4 --- client/values.yaml | 7 ----- 7 files changed, 8 insertions(+), 80 deletions(-) diff --git a/client/templates/jobs-manager-deployment.yaml b/client/templates/jobs-manager-deployment.yaml index a9d5da8..374d845 100644 --- a/client/templates/jobs-manager-deployment.yaml +++ b/client/templates/jobs-manager-deployment.yaml @@ -62,11 +62,6 @@ spec: secretKeyRef: name: {{ include "tracebloc.secretName" . }} key: CLIENT_PASSWORD - - name: REQUESTS_PROXY_ADMIN_TOKEN - valueFrom: - secretKeyRef: - name: {{ .Release.Name }}-requests-proxy-admin - key: token - name: CLIENT_PVC value: {{ include "tracebloc.clientDataPvc" . 
| quote }} - name: CLIENT_LOGS_PVC diff --git a/client/templates/requests-proxy-deployment.yaml b/client/templates/requests-proxy-deployment.yaml index 3423137..84b6eca 100644 --- a/client/templates/requests-proxy-deployment.yaml +++ b/client/templates/requests-proxy-deployment.yaml @@ -56,11 +56,6 @@ spec: cpu: {{ .Values.resources.requestsProxy.limits.cpu | default "500m" | quote }} memory: {{ .Values.resources.requestsProxy.limits.memory | default "512Mi" | quote }} env: - - name: REQUESTS_PROXY_ADMIN_TOKEN - valueFrom: - secretKeyRef: - name: {{ .Release.Name }}-requests-proxy-admin - key: token - name: MYSQL_HOST value: "mysql-client" - name: EXPERIMENTS_QUEUE_NAME diff --git a/client/templates/secrets.yaml b/client/templates/secrets.yaml index 620a5e7..65242b4 100644 --- a/client/templates/secrets.yaml +++ b/client/templates/secrets.yaml @@ -13,27 +13,6 @@ type: Opaque data: CLIENT_ID: {{ $clientId | b64enc | quote }} CLIENT_PASSWORD: {{ $clientPassword | b64enc | quote }} ---- -{{ $proxySecretName := printf "%s-requests-proxy-admin" .Release.Name -}} -{{- $existingProxySecret := lookup "v1" "Secret" .Release.Namespace $proxySecretName -}} -{{- $proxyAdminToken := "" -}} -{{- if .Values.requestsProxyAdminToken -}} -{{- $proxyAdminToken = .Values.requestsProxyAdminToken -}} -{{- else if and $existingProxySecret $existingProxySecret.data (index $existingProxySecret.data "token") -}} -{{- $proxyAdminToken = index $existingProxySecret.data "token" | b64dec -}} -{{- else -}} -{{- $proxyAdminToken = randAlphaNum 64 -}} -{{- end -}} -apiVersion: v1 -kind: Secret -metadata: - name: {{ $proxySecretName }} - namespace: {{ .Release.Namespace }} - labels: - {{- include "tracebloc.labels" . 
| nindent 4 }} -type: Opaque -data: - token: {{ $proxyAdminToken | b64enc | quote }} {{- if and (ne .Values.resourceMonitor false) (ne .Values.nodeAgents.namespace.name .Release.Namespace) }} --- # Mirrored into the node-agents namespace so the resource-monitor DaemonSet diff --git a/client/tests/node_agents_namespace_test.yaml b/client/tests/node_agents_namespace_test.yaml index aea6431..67d4214 100644 --- a/client/tests/node_agents_namespace_test.yaml +++ b/client/tests/node_agents_namespace_test.yaml @@ -186,7 +186,7 @@ tests: template: templates/secrets.yaml asserts: - hasDocuments: - count: 3 + count: 2 - isKind: of: Secret documentIndex: 0 @@ -196,21 +196,21 @@ tests: documentIndex: 0 - isKind: of: Secret - documentIndex: 2 + documentIndex: 1 - equal: path: metadata.namespace value: tracebloc-node-agents - documentIndex: 2 + documentIndex: 1 - equal: path: metadata.name value: RELEASE-NAME-secrets - documentIndex: 2 + documentIndex: 1 - isNotEmpty: path: data.CLIENT_ID - documentIndex: 2 + documentIndex: 1 - isNotEmpty: path: data.CLIENT_PASSWORD - documentIndex: 2 + documentIndex: 1 - it: should not mirror the tracebloc Secret when resourceMonitor is disabled template: templates/secrets.yaml @@ -218,7 +218,7 @@ tests: resourceMonitor: false asserts: - hasDocuments: - count: 2 + count: 1 - equal: path: metadata.namespace value: tracebloc-templates @@ -233,7 +233,7 @@ tests: name: tracebloc-templates asserts: - hasDocuments: - count: 2 + count: 1 - equal: path: metadata.namespace value: tracebloc-templates diff --git a/client/tests/secrets_test.yaml b/client/tests/secrets_test.yaml index 0505893..0113e46 100644 --- a/client/tests/secrets_test.yaml +++ b/client/tests/secrets_test.yaml @@ -31,36 +31,6 @@ tests: pattern: Helm documentIndex: 0 - - it: should use explicit requests-proxy admin token override - template: templates/secrets.yaml - set: - clientId: "my-client-id" - clientPassword: "my-secret-pass" - requestsProxyAdminToken: "override-token" - asserts: - 
- equal: - path: metadata.name - value: RELEASE-NAME-requests-proxy-admin - documentIndex: 1 - - equal: - path: data.token - value: b3ZlcnJpZGUtdG9rZW4= - documentIndex: 1 - - - it: should generate requests-proxy admin token when no override is provided - template: templates/secrets.yaml - set: - clientId: "my-client-id" - clientPassword: "my-secret-pass" - asserts: - - equal: - path: metadata.name - value: RELEASE-NAME-requests-proxy-admin - documentIndex: 1 - - isNotEmpty: - path: data.token - documentIndex: 1 - - it: should create docker registry secret when create is true template: templates/docker-registry-secret.yaml set: diff --git a/client/values.schema.json b/client/values.schema.json index 709f7c1..5d2a835 100644 --- a/client/values.schema.json +++ b/client/values.schema.json @@ -456,10 +456,6 @@ "not": { "pattern": "^<.*>$" }, "description": "Client authentication password. Must be a real value, not a placeholder like ." }, - "requestsProxyAdminToken": { - "type": "string", - "description": "Optional admin token override for the requests-proxy service." - }, "autoUpgrade": { "type": "object", "description": "Self-upgrade CronJob (issue tracebloc/client#69). Polls the helm repo at autoUpgrade.repoUrl daily and runs `helm upgrade --reuse-values` when a newer chart version is published. Disable to keep the release pinned to its install-time chart version.", diff --git a/client/values.yaml b/client/values.yaml index dab4804..8fd958c 100644 --- a/client/values.yaml +++ b/client/values.yaml @@ -276,13 +276,6 @@ podDisruptionBudget: clientId: "" clientPassword: "" -# -- Admin token for the requests-proxy service. -# Optional. If set, this value overrides any existing requests-proxy admin -# secret on install or upgrade. When left empty, the chart reuses the current -# secret if present or generates a new token on first install. -# Do NOT commit real tokens to version control. 
-requestsProxyAdminToken: "" - # -- Docker registry credentials (optional; only used when dockerRegistry is set and create is true) # Omit dockerRegistry entirely, or set create: false, for public images (no imagePullSecrets). # When create is true, secret name is {{ .Release.Name }}-regcred. From 5b607760d451655af60d0f2da5c2314ae3ae14bf Mon Sep 17 00:00:00 2001 From: Syed Is Saqlain Date: Thu, 14 May 2026 19:22:49 +0530 Subject: [PATCH 13/24] Reduce dependency on values.yaml file for requests proxy (#122) Co-authored-by: Syed Saqlain --- client/templates/requests-proxy-deployment.yaml | 12 ++++++------ client/values.yaml | 7 ------- 2 files changed, 6 insertions(+), 13 deletions(-) diff --git a/client/templates/requests-proxy-deployment.yaml b/client/templates/requests-proxy-deployment.yaml index 84b6eca..b0e2346 100644 --- a/client/templates/requests-proxy-deployment.yaml +++ b/client/templates/requests-proxy-deployment.yaml @@ -22,8 +22,8 @@ spec: type: RuntimeDefault containers: - name: proxy - image: {{ include "tracebloc.image" (dict "repository" "tracebloc/jobs-manager" "tag" .Values.env.CLIENT_ENV "digest" .Values.images.requestsProxy.digest "registry" "docker.io") | quote }} - imagePullPolicy: {{ if .Values.images.requestsProxy.digest }}IfNotPresent{{ else }}Always{{ end }} + image: {{ include "tracebloc.image" (dict "repository" "tracebloc/jobs-manager" "tag" .Values.env.CLIENT_ENV "digest" (dig "requestsProxy" "digest" "" .Values.images) "registry" "docker.io") | quote }} + imagePullPolicy: Always workingDir: /app command: ["python", "-m", "gunicorn"] args: @@ -50,11 +50,11 @@ spec: readOnlyRootFilesystem: true resources: requests: - cpu: {{ .Values.resources.requestsProxy.requests.cpu | default "100m" | quote }} - memory: {{ .Values.resources.requestsProxy.requests.memory | default "256Mi" | quote }} + cpu: 100m + memory: 256Mi limits: - cpu: {{ .Values.resources.requestsProxy.limits.cpu | default "500m" | quote }} - memory: {{ 
.Values.resources.requestsProxy.limits.memory | default "512Mi" | quote }} + cpu: 1000m + memory: 512Mi env: - name: MYSQL_HOST value: "mysql-client" diff --git a/client/values.yaml b/client/values.yaml index 8fd958c..3ab6370 100644 --- a/client/values.yaml +++ b/client/values.yaml @@ -232,13 +232,6 @@ resources: limits: cpu: "500m" memory: "512Mi" - requestsProxy: - requests: - cpu: "100m" - memory: "256Mi" - limits: - cpu: "500m" - memory: "512Mi" # -- PriorityClass for the data-plane (mysql). # Cluster-scoped resource. Created with helm.sh/resource-policy: keep so a From 62d5530133fd20c8c476671e5297387124e71c72 Mon Sep 17 00:00:00 2001 From: "Asad Iqbal (Saadi)" Date: Mon, 18 May 2026 17:16:55 +0500 Subject: [PATCH 14/24] feat(#86): ingestor Helm subchart + companion RBAC/service/authz for new ingestion endpoint (#123) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: companion chart changes for ingestion endpoint (client-runtime#21) Wires the cluster side of the new ingestion flow into the main client chart so the upcoming ingestor subchart can actually reach jobs-manager. Five small changes: 1. **rbac.yaml** — adds three permissions to jobs-manager's RBAC: - authentication.k8s.io/tokenreviews create - configmaps create - secrets create The endpoint validates caller SA tokens via TokenReview and creates a per-run ConfigMap (ingest.yaml) + Secret (BACKEND_TOKEN) before spawning the ingestor Job. `tokenreviews` is cluster-scoped and only added to the ClusterRole branch; customers with `clusterScope: false` won't have the ingestion endpoint authenticate. Documented in the rule comments. 2. **jobs-manager-service.yaml** (new) — ClusterIP exposing port 8080 at the stable name `jobs-manager`, so the ingestor subchart's post-install hook doesn't need to discover Pod IPs. 3. 
**jobs-manager-deployment.yaml** — adds containerPort 8080 on the `api` container, mounts the ingestion-authz ConfigMap at `/etc/tracebloc/ingestion-authz.yaml`, declares the corresponding pod-level volume. 4. **ingestion-authz-configmap.yaml** (new) — renders the `ingestionAuthz.allowed` policy customers configure in values.yaml. Mounted into jobs-manager and read at startup by `submit_ingestion_run.load_authz_policy`. Each entry maps (namespace, service_account) → allowed table_prefixes; omitted `namespace` defaults to .Release.Namespace. 5. **values.yaml** — adds the `ingestionAuthz.allowed` default that permits the ingestor subchart's default SA (named `ingestor`) to ingest into any table. Customers tighten via overrides. Verified ──────── - helm lint passes (only pre-existing icon-recommended INFO). - helm template renders all five resources cleanly with expected values (Service name, RBAC verbs, container port, volume mount). - helm unittest: 116/116 tests pass (existing snapshots unchanged). Co-Authored-By: Claude Opus 4.7 (1M context) * feat(#86): ingestor Helm subchart (post-install hook submits to jobs-manager) The customer-facing chart that finally closes the end-to-end loop: helm install my-dataset tracebloc/ingestor --namespace tracebloc \ --set-file ingestConfig=./my-ingest.yaml \ --set image.digest=sha256: Renders the customer's ingest.yaml into a ConfigMap, then a post-install hook Job POSTs `{ingest_config, idempotency_key, image_digest}` to jobs-manager's `/internal/submit-ingestion-run` endpoint (client-runtime#21). jobs-manager validates the SA token via TokenReview, validates the YAML against ingest.v1, mints a backend token, creates the per-run ConfigMap + Secret + Job, returns 201 (or 200 on replay). 
Layout ────── ingestor/ ├── Chart.yaml appVersion: 0.3.0-rc1 (the data-ingestors release) ├── values.yaml ingestConfig (required, --set-file), image.digest │ (required, sha256), jobsManager.endpoint, │ serviceAccount.create, hook resources, idempotency ├── README.md ownership boundaries + verification commands ├── .helmignore └── templates/ ├── _helpers.tpl ├── serviceaccount.yaml default name "ingestor" ├── configmap-ingest-config.yaml hook-weight 0 └── post-install-job.yaml hook-weight 1, runs as the SA, reads its own token, POSTs. Ownership boundary ────────────────── Per #86's acceptance criteria, the README spells out what `helm uninstall` does and doesn't clean up: This chart owns: ConfigMap (ingest.yaml), the hook Job, the SA. jobs-manager owns: the per-run ConfigMap, Secret, ingestor Job. The cluster owns: the ingested data + metadata POSTed to the backend. `helm uninstall my-dataset` removes only the chart's footprint. The running ingestor Job and its data persist. This is deliberate — uninstall is not a cancel button. The README documents the kubectl command to cancel a run if needed. Implementation choices ────────────────────── - **post-install hook, not a long-lived resource.** The hook is the whole point of this chart — fire once, exit. - **automountServiceAccountToken: true** for the hook Job. That's the whole authentication mechanism — TokenReview on the SA token. Every other tracebloc workload disables automount; this one needs it. - **`hook-delete-policy: before-hook-creation`**, NOT `hook-succeeded`. Keeps the completed Job around so operators can `kubectl logs` the POST response after install. Cleaned up only on the next install under the same release. - **curlimages/curl** as the hook image — small, official, and ships python3 which we use to JSON-encode the multi-line YAML body safely (jq has a JSON-escape edge case for YAML newlines that's easier to side-step than handle). 
- **idempotencyKey defaults to `<release-name>-<revision>`** so a `helm upgrade` submits a fresh run. Customers override to a stable UUID if they want strict at-most-once across reinstalls. Verified ──────── - helm lint passes. - helm template renders all four resources (ConfigMap, Job, SA, and the inline templates expand cleanly with --set-file ingestConfig). - Required-value gates fire correctly: missing image.digest fails template; missing ingestConfig fails template. Closes #86 Co-Authored-By: Claude Opus 4.7 (1M context) * fix(#86): pre-render JSON body in ConfigMap, drop python3 + shell JSON escape Three bugbot findings on the first ingestor-chart pass, all real: 1. HIGH — curlimages/curl runtime layer doesn't include python3 (only in the build stage; stripped in the final image). The hook's `python3 -c ...` JSON encoder would fail with "python3: not found" on every install. 2. HIGH — even if python3 were available, the shell syntax `python3 -c "..." VAR=value` puts the assignments AFTER the command, which makes them positional argv, not env. The `os.environ['INGEST_CONFIG']` lookup would raise KeyError. 3. MEDIUM — `nindent 4` after literal template-source indentation puts a leading blank line into the YAML block scalar, so the customer's ingest.yaml gets a "\n" prefix that block-scalar parsers tolerate but is wrong. Structural fix rather than tweaking the script: the three POST-body fields (ingest_config, idempotency_key, image_digest) are ALL known at helm-template time. Render the JSON body in the ConfigMap as `body.json` using Helm's `toJson` filter — which handles multi-line string escaping correctly — then the hook becomes a one-line `curl --data-binary @body.json`. No python3 needed, no shell-side JSON construction at all. Eliminates both HIGH bugs as a category, not just instance-by-instance. For bug 3: use the left-trim action delimiter (dash inside braces) before the `required ... | nindent 4` action so it eats the leading whitespace cleanly.
Verified via `helm template` that the rendered `ingest.yaml` now starts cleanly with `apiVersion:`. Verified ──────── - helm lint passes on both client/ and ingestor/. - helm template renders the JSON body with correct escaping (multi-line YAML → "\n"-escaped scalar in JSON). - helm template renders ingest.yaml with no leading blank line. - helm unittest client/: 116/116 pass. Co-Authored-By: Claude Opus 4.7 (1M context) * fix(#86): track ingestor/values.yaml (was silently .gitignored) bugbot caught a serious oversight: `ingestor/values.yaml` exists in the working tree but never made it into the repository. Every `git add ingestor/` silently dropped it because the repo's .gitignore at line 119 has `/*/values*.yaml` — an anti-leak pattern for operator values files — which matches `ingestor/values.yaml`. Without the file the chart is broken on `helm install`: every template references `.Values.hookImage.repository`, `.Values.jobsManager.endpoint`, etc., and Helm renders nil-pointer errors when the keys are absent. Two-line fix: - Add `!ingestor/values.yaml` to .gitignore (mirrors the existing `!client/values*.yaml` exception for the main chart). Documents *why* the exception exists, so a future cleanup pass doesn't re-introduce the bug. - Commit the actual values.yaml file with the defaults already referenced by the README and the templates. Local verification before pushing: helm template my-dataset ingestor/ --namespace tracebloc \ --set ingestConfig=... --set image.digest=sha256:... \ # renders ServiceAccount, ConfigMap, Job correctly. Lesson for future runs: `git add /` is *not* a verification that files were added — gitignore patterns can silently drop them. Should have verified with `git status` before commit; would have caught this before bugbot did. 
Co-Authored-By: Claude Opus 4.7 (1M context) --------- Co-authored-by: Claude Opus 4.7 (1M context) --- .gitignore | 5 + .../templates/ingestion-authz-configmap.yaml | 27 ++++ client/templates/jobs-manager-deployment.yaml | 17 +++ client/templates/jobs-manager-service.yaml | 22 ++++ client/templates/rbac.yaml | 23 ++++ client/values.yaml | 26 ++++ ingestor/.helmignore | 18 +++ ingestor/Chart.yaml | 24 ++++ ingestor/README.md | 88 +++++++++++++ ingestor/templates/_helpers.tpl | 52 ++++++++ .../templates/configmap-ingest-config.yaml | 45 +++++++ ingestor/templates/post-install-job.yaml | 123 ++++++++++++++++++ ingestor/templates/serviceaccount.yaml | 15 +++ ingestor/values.yaml | 78 +++++++++++ 14 files changed, 563 insertions(+) create mode 100644 client/templates/ingestion-authz-configmap.yaml create mode 100644 client/templates/jobs-manager-service.yaml create mode 100644 ingestor/.helmignore create mode 100644 ingestor/Chart.yaml create mode 100644 ingestor/README.md create mode 100644 ingestor/templates/_helpers.tpl create mode 100644 ingestor/templates/configmap-ingest-config.yaml create mode 100644 ingestor/templates/post-install-job.yaml create mode 100644 ingestor/templates/serviceaccount.yaml create mode 100644 ingestor/values.yaml diff --git a/.gitignore b/.gitignore index 14e8cdb..245a241 100644 --- a/.gitignore +++ b/.gitignore @@ -118,6 +118,11 @@ venv.bak/ /values*.yaml /*/values*.yaml !client/values*.yaml +# The ingestor subchart's values.yaml is part of the chart shipped to +# customers — must be tracked despite matching the `/*/values*.yaml` +# anti-leak pattern above (which exists to keep operator-local values +# files out of the repo). 
+!ingestor/values.yaml secret-values.yaml test-template.yaml eks.diff diff --git a/client/templates/ingestion-authz-configmap.yaml b/client/templates/ingestion-authz-configmap.yaml new file mode 100644 index 0000000..f5a6dc4 --- /dev/null +++ b/client/templates/ingestion-authz-configmap.yaml @@ -0,0 +1,27 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + # client-runtime#21: defines which ServiceAccount(s) may call + # POST /internal/submit-ingestion-run on jobs-manager, scoped to which + # tables. Mounted into jobs-manager at /etc/tracebloc/ingestion-authz.yaml + # and read at startup (see submit_ingestion_run.load_authz_policy). + # + # Each entry in `ingestionAuthz.allowed` maps a (namespace, service_account) + # to a list of table-name prefixes. Omit `namespace` to default to the + # release's namespace — the common case where the ingestor subchart is + # installed alongside the tracebloc client. + name: {{ .Release.Name }}-ingestion-authz + namespace: {{ .Release.Namespace }} + labels: + {{- include "tracebloc.labels" . | nindent 4 }} +data: + ingestion-authz.yaml: | + allowed: + {{- range .Values.ingestionAuthz.allowed }} + - service_account: {{ .service_account | quote }} + namespace: {{ .namespace | default $.Release.Namespace | quote }} + table_prefixes: + {{- range .table_prefixes }} + - {{ . | quote }} + {{- end }} + {{- end }} diff --git a/client/templates/jobs-manager-deployment.yaml b/client/templates/jobs-manager-deployment.yaml index 374d845..e8d4d8e 100644 --- a/client/templates/jobs-manager-deployment.yaml +++ b/client/templates/jobs-manager-deployment.yaml @@ -46,11 +46,25 @@ spec: limits: cpu: {{ .Values.resources.jobsManager.limits.cpu | default "1000m" | quote }} memory: {{ .Values.resources.jobsManager.limits.memory | default "1Gi" | quote }} + ports: + # client-runtime#21: POST /internal/submit-ingestion-run. The + # ingestor subchart's post-install hook hits this through the + # jobs-manager Service. Cluster-internal only. 
+ - name: http + containerPort: 8080 + protocol: TCP volumeMounts: - name: shared-volume mountPath: "/data/shared" - name: logs-volume mountPath: "/data/logs" + # ingestion-authz policy read by jobs-manager at startup. The + # subPath mount means an authz update via `helm upgrade` is + # picked up on the next jobs-manager restart, not hot-reloaded. + - name: ingestion-authz + mountPath: /etc/tracebloc/ingestion-authz.yaml + subPath: ingestion-authz.yaml + readOnly: true env: - name: CLIENT_ID valueFrom: @@ -151,4 +165,7 @@ spec: - name: logs-volume persistentVolumeClaim: claimName: {{ include "tracebloc.clientLogsPvc" . }} + - name: ingestion-authz + configMap: + name: {{ .Release.Name }}-ingestion-authz restartPolicy: Always diff --git a/client/templates/jobs-manager-service.yaml b/client/templates/jobs-manager-service.yaml new file mode 100644 index 0000000..d048d10 --- /dev/null +++ b/client/templates/jobs-manager-service.yaml @@ -0,0 +1,22 @@ +apiVersion: v1 +kind: Service +metadata: + # client-runtime#21: exposes jobs-manager's HTTP ingestion endpoint + # (POST /internal/submit-ingestion-run) at a stable in-cluster name so + # the ingestor subchart's post-install hook (tracebloc/client#86) can + # POST without discovering Pod IPs. ClusterIP-only — there is no + # legitimate caller outside the cluster. + name: jobs-manager + namespace: {{ .Release.Namespace }} + labels: + {{- include "tracebloc.labels" . 
| nindent 4 }} + app: manager +spec: + selector: + app: manager + ports: + - name: http + port: 8080 + targetPort: 8080 + protocol: TCP + type: ClusterIP diff --git a/client/templates/rbac.yaml b/client/templates/rbac.yaml index 3f25a4f..3982a5c 100644 --- a/client/templates/rbac.yaml +++ b/client/templates/rbac.yaml @@ -30,6 +30,21 @@ rules: - apiGroups: ["metrics.k8s.io"] resources: ["pods"] verbs: ["get", "list"] + # client-runtime#21: jobs-manager's POST /internal/submit-ingestion-run + # endpoint authenticates callers (the ingestor subchart's post-install + # hook) via TokenReview, then creates a per-run ConfigMap (ingest.yaml) + # and Secret (BACKEND_TOKEN) before spawning the ingestor Job. + # + # TokenReview is a cluster-scoped Kubernetes API; the create verb cannot + # be granted by a namespace Role. This means the ingestion endpoint + # requires ``clusterScope: true``. The else branch below intentionally + # omits it. + - apiGroups: ["authentication.k8s.io"] + resources: ["tokenreviews"] + verbs: ["create"] + - apiGroups: [""] + resources: ["configmaps", "secrets"] + verbs: ["create"] --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding @@ -73,6 +88,14 @@ rules: - apiGroups: ["metrics.k8s.io"] resources: ["pods"] verbs: ["get", "list"] + # client-runtime#21: ingestion endpoint creates a per-run ConfigMap + # (ingest.yaml) and Secret (BACKEND_TOKEN) before spawning the Job. + # TokenReview is omitted here (cluster-scoped API; not grantable by a + # namespace Role). Customers using clusterScope: false won't be able + # to authenticate ingestion calls — see the ClusterRole branch above. 
+ - apiGroups: [""] + resources: ["configmaps", "secrets"] + verbs: ["create"] --- apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding diff --git a/client/values.yaml b/client/values.yaml index 3ab6370..f077941 100644 --- a/client/values.yaml +++ b/client/values.yaml @@ -341,3 +341,29 @@ autoUpgrade: limits: cpu: "500m" memory: "256Mi" + +# ============================================================ +# Ingestion endpoint authorization (client-runtime#21) +# ============================================================ +# jobs-manager's POST /internal/submit-ingestion-run authenticates callers +# via Kubernetes TokenReview, then authorizes against this policy. Mapping +# is (namespace, service_account) → list of allowed table-name prefixes; +# "*" means any table. +# +# Default: allow the ingestor subchart's SA (named "ingestor" by default) to +# ingest into any table. Customers tighten via overrides — e.g.: +# +# ingestionAuthz: +# allowed: +# - service_account: my-dataset-ingestor +# namespace: tracebloc +# table_prefixes: ["chest_xrays_", "tumors_"] +ingestionAuthz: + allowed: + # Default: the ingestor subchart's SA (named "ingestor") in the same + # namespace as the tracebloc release can ingest into any table. The + # `namespace` field is optional; when omitted the ConfigMap template + # substitutes `.Release.Namespace`. Customers tighten by adding more + # specific entries with explicit `namespace` and `table_prefixes`. + - service_account: ingestor + table_prefixes: ["*"] diff --git a/ingestor/.helmignore b/ingestor/.helmignore new file mode 100644 index 0000000..48556d6 --- /dev/null +++ b/ingestor/.helmignore @@ -0,0 +1,18 @@ +# Patterns to ignore when building chart packages.
+.DS_Store +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +*.swp +*.bak +*.tmp +*.orig +*~ +.project +.idea/ +*.tmproj +.vscode/ diff --git a/ingestor/Chart.yaml b/ingestor/Chart.yaml new file mode 100644 index 0000000..87c37fc --- /dev/null +++ b/ingestor/Chart.yaml @@ -0,0 +1,24 @@ +apiVersion: v2 +name: ingestor +description: | + Helm chart for submitting a tracebloc data-ingestion run. Renders a + customer's ingest.yaml into a ConfigMap and runs a post-install hook + that POSTs it to jobs-manager's /internal/submit-ingestion-run. + jobs-manager validates the YAML, mints a backend token, and creates + the ingestor Job directly — this chart owns the config + hook + artifacts, not the resulting Job or its data. +type: application +version: 0.1.0 +appVersion: "0.3.0-rc1" +keywords: + - tracebloc + - kubernetes + - data-ingestion +home: https://github.com/tracebloc/client/tree/develop/ingestor +sources: + - https://github.com/tracebloc/client + - https://github.com/tracebloc/data-ingestors + - https://github.com/tracebloc/client-runtime +maintainers: + - name: tracebloc + email: support@tracebloc.io diff --git a/ingestor/README.md b/ingestor/README.md new file mode 100644 index 0000000..f7893cb --- /dev/null +++ b/ingestor/README.md @@ -0,0 +1,88 @@ +# tracebloc/ingestor Helm chart + +A thin chart that submits one data-ingestion run to your tracebloc client cluster. 
Wraps the `POST /internal/submit-ingestion-run` endpoint on jobs-manager (client-runtime#21) so the customer-facing UX is: + +```bash +helm install my-dataset tracebloc/ingestor \ + --namespace tracebloc \ + --set-file ingestConfig=./my-ingest.yaml \ + --set image.digest=sha256:<digest> +``` + +## What this chart owns + +| Resource | Owner | Lifecycle | +|---|---|---| +| `ConfigMap/<release>-config` (holds `ingest.yaml`) | this chart | created by `helm install`, deleted by `helm uninstall` | +| `Job/<release>-submit` (post-install hook that POSTs) | this chart | created post-install, removed before each `helm upgrade` | +| `ServiceAccount/<name>` | this chart (optional, default true) | created by `helm install`, deleted by `helm uninstall` | +| `ConfigMap/ingest-config-<run-id>` (per-run, mounted into the ingestor Pod) | **jobs-manager** | created by jobs-manager on accept; not managed by Helm | +| `Secret/ingest-token-<run-id>` (per-run, holds `BACKEND_TOKEN`) | **jobs-manager** | same | +| `Job/ingest-job-<run-id>` (the actual ingestor) | **jobs-manager** | same | +| **Ingested data** (rows in cluster-internal MySQL + metadata POSTed to the backend) | **the cluster** | persists past `helm uninstall` | + +**`helm uninstall my-dataset` will not delete the running ingestor Job or any ingested data.** It removes only the config + hook artifacts above. Document this with operators so they don't expect uninstall to act as a "cancel ingestion" button. + +## How the install works end-to-end + +1. `helm install` renders `ConfigMap/<release>-config` containing the customer's `ingest.yaml` body. +2. Helm fires the `post-install` hook: a Job that runs as the chart's ServiceAccount. +3. The hook reads its SA token from the projected volume and the `ingest.yaml` from the ConfigMap mount. +4. The hook POSTs `{ ingest_config, idempotency_key, image_digest }` to `jobs-manager:8080/internal/submit-ingestion-run`. +5.
**jobs-manager** validates the SA token via Kubernetes TokenReview, then checks the (SA, table) pair against the cluster's `ingestionAuthz` policy (a ConfigMap rendered by the parent `tracebloc/client` chart). +6. If authorized, jobs-manager validates the YAML against the `ingest.v1` JSON schema, mints a backend token, creates the per-run ConfigMap + Secret + Job, records the run for idempotency, and returns `201` (or `200` if this idempotency_key has been seen before). +7. The hook treats `2xx` as success and exits 0; `helm install` reports success. Non-`2xx` exits 1 and `helm install` fails with the response body in the output. + +The customer never builds an image. The customer never writes a Dockerfile. The customer writes ~8 lines of YAML. + +## Required values + +| Value | Description | +|---|---| +| `ingestConfig` | The full `ingest.yaml` body. **Set via `--set-file`** — the body almost always contains YAML special characters that don't survive `--set`. | +| `image.digest` | A `sha256:<64-hex>` digest of a `ghcr.io/tracebloc/ingestor` release. Tags are rejected by jobs-manager. See the [data-ingestors releases page](https://github.com/tracebloc/data-ingestors/releases) for current digests. | + +## Frequently-overridden values + +| Value | Default | When to override | +|---|---|---| +| `jobsManager.endpoint` | `http://jobs-manager.tracebloc.svc.cluster.local:8080` | The parent `tracebloc/client` release isn't in the `tracebloc` namespace, or you're testing against a port-forward. | +| `serviceAccount.name` | `ingestor` | The cluster's `ingestionAuthz` policy expects a different SA name. (Default matches the parent chart's default.) | +| `image.repository` | `ghcr.io/tracebloc/ingestor` | Air-gapped mirror. | +| `idempotencyKey` | `<release-name>-<revision>` | You want strict at-most-once semantics across re-installs under the same release name. | +| `hookTimeoutSeconds` | `30` | Slow networks or large schemas. | + +See `values.yaml` for the full set.
+ +## Verifying after install + +```bash +# Helm-side artifacts (this chart's footprint): +kubectl -n tracebloc get configmap,job,serviceaccount -l app.kubernetes.io/instance=my-dataset + +# jobs-manager-side artifacts (the actual run): +kubectl -n tracebloc get jobs -l tracebloc.io/ingestion-run + +# Watch the ingestion run progress: +kubectl -n tracebloc logs -l tracebloc.io/ingestion-run --tail=-1 +``` + +## Uninstalling + +```bash +helm uninstall my-dataset --namespace tracebloc +``` + +Removes the chart's ConfigMap + hook Job + ServiceAccount. Does **not** remove the running ingestor Job, its outputs, or the metadata posted to the backend — those are owned by jobs-manager and the cluster respectively. + +To cancel an in-flight run, work with jobs-manager directly: + +```bash +kubectl -n tracebloc delete job -l tracebloc.io/ingestion-run= +``` + +## Related + +- [tracebloc/data-ingestors](https://github.com/tracebloc/data-ingestors) — the ingestor image and YAML schema. +- [tracebloc/client-runtime#21](https://github.com/tracebloc/client-runtime/pull/35) — the `submit-ingestion-run` endpoint this chart calls. +- [tracebloc/client](https://github.com/tracebloc/client) — the parent chart that runs jobs-manager and renders the `ingestionAuthz` policy. diff --git a/ingestor/templates/_helpers.tpl b/ingestor/templates/_helpers.tpl new file mode 100644 index 0000000..1d1bcf8 --- /dev/null +++ b/ingestor/templates/_helpers.tpl @@ -0,0 +1,52 @@ +{{- /* +Shared template helpers for the tracebloc/ingestor chart. + +Naming: + {release}-config — ConfigMap holding the ingest.yaml the post-install + hook reads + POSTs. + {release}-submit — Job that runs the helm post-install hook (the POST). + Resulting ingestor Job created by jobs-manager has + its own name (idempotency-key-derived) and is NOT + managed by this chart. 
+*/ -}} + +{{- define "ingestor.fullname" -}} +{{ .Release.Name }} +{{- end -}} + +{{- define "ingestor.configMapName" -}} +{{ .Release.Name }}-config +{{- end -}} + +{{- define "ingestor.hookJobName" -}} +{{ .Release.Name }}-submit +{{- end -}} + +{{- define "ingestor.serviceAccountName" -}} +{{- if .Values.serviceAccount.create -}} +{{ .Values.serviceAccount.name }} +{{- else -}} +{{ .Values.serviceAccount.name | required "serviceAccount.name is required when serviceAccount.create=false" }} +{{- end -}} +{{- end -}} + +{{- /* +Resolved idempotency key. Defaults to "<release-name>-<revision>" so each +helm install / upgrade submits a fresh run; explicit override is +honored verbatim. +*/ -}} +{{- define "ingestor.idempotencyKey" -}} +{{- if .Values.idempotencyKey -}} +{{ .Values.idempotencyKey }} +{{- else -}} +{{ printf "%s-%d" .Release.Name .Release.Revision }} +{{- end -}} +{{- end -}} + +{{- define "ingestor.labels" -}} +app.kubernetes.io/name: ingestor +app.kubernetes.io/instance: {{ .Release.Name }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +helm.sh/chart: {{ printf "%s-%s" .Chart.Name .Chart.Version | quote }} +{{- end -}} diff --git a/ingestor/templates/configmap-ingest-config.yaml b/ingestor/templates/configmap-ingest-config.yaml new file mode 100644 index 0000000..ccdf2e5 --- /dev/null +++ b/ingestor/templates/configmap-ingest-config.yaml @@ -0,0 +1,45 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + # Holds the customer's ingest.yaml and the pre-rendered JSON body that + # the post-install hook POSTs. Both keys are populated at helm-template + # time; the runtime hook never has to construct JSON itself, which + # eliminates an entire class of shell-escaping bugs (bugbot caught the + # original python3-based encoder). + # + # Lifecycle: created by `helm install`, removed by `helm uninstall`.
+ # The resulting ingestor Job (created by jobs-manager) does NOT depend + # on this ConfigMap — jobs-manager creates its own per-Job-named + # ConfigMap before spawning the Job — so deleting this one mid-run + # does not interrupt ingestion. + name: {{ include "ingestor.configMapName" . }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "ingestor.labels" . | nindent 4 }} + annotations: + # Hook ordering: this ConfigMap must exist before the post-install + # hook Job mounts it. helm.sh/hook-weight orders sibling hook + # resources; lower runs first. + helm.sh/hook: post-install,post-upgrade + helm.sh/hook-weight: "0" +data: + # Verbatim ingest.yaml body (useful for `kubectl get cm -o yaml` + # debugging). The left-trim action delimiter (dash inside the action + # braces) eats the whitespace before the action so we don't render a + # leading blank line into the block scalar — bugbot caught this on + # the first pass; without the trim, the customer's YAML got a leading + # "\n" prefix that block-scalar parsers tolerate but is still wrong. + ingest.yaml: |- + {{- required "ingestConfig must be set (use --set-file ingestConfig=path/to/ingest.yaml)" .Values.ingestConfig | nindent 4 }} + # Pre-rendered JSON body that the post-install hook POSTs to + # jobs-manager. Computed at helm-template time so the hook is a + # one-line `curl --data-binary @body.json` — no python, no jq, no + # shell-escaping the multi-line YAML scalar. The required-value gates + # below fire here so a missing digest or ingest config fails + # templating (and therefore `helm install`) with a clear message + # before any cluster state is touched. 
+ body.json: |- + {{- $cfg := required "ingestConfig must be set (use --set-file ingestConfig=path/to/ingest.yaml)" .Values.ingestConfig -}} + {{- $digest := required "image.digest must be a sha256: digest of the ghcr.io/tracebloc/ingestor image (see the data-ingestors GitHub Release for current digests)" .Values.image.digest -}} + {{- $key := include "ingestor.idempotencyKey" . -}} + {{- dict "ingest_config" $cfg "idempotency_key" $key "image_digest" $digest | toJson | nindent 4 }} diff --git a/ingestor/templates/post-install-job.yaml b/ingestor/templates/post-install-job.yaml new file mode 100644 index 0000000..11d2c7d --- /dev/null +++ b/ingestor/templates/post-install-job.yaml @@ -0,0 +1,123 @@ +apiVersion: batch/v1 +kind: Job +metadata: + # client#86: post-install hook that POSTs the ingest.yaml to + # jobs-manager. Hook lifecycle: + # + # - post-install + post-upgrade fires the Job. + # - hook-weight 1 ensures the ConfigMap (weight 0) is in place first. + # - hook-delete-policy "before-hook-creation" cleans up the previous + # Job on a re-install of the same release, so we don't trip on a + # leftover completed Job. We DO NOT use "hook-succeeded" because + # keeping the Job around lets operators read the POST result via + # `kubectl logs` after install. + name: {{ include "ingestor.hookJobName" . }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "ingestor.labels" . | nindent 4 }} + annotations: + helm.sh/hook: post-install,post-upgrade + helm.sh/hook-weight: "1" + helm.sh/hook-delete-policy: before-hook-creation +spec: + backoffLimit: 0 + # Bound: jobs-manager validates synchronously; if it hasn't responded + # in this window something is genuinely wrong and the install should + # fail loudly rather than hang. + activeDeadlineSeconds: {{ .Values.hookTimeoutSeconds | default 30 }} + ttlSecondsAfterFinished: 600 + template: + metadata: + labels: + {{- include "ingestor.labels" . 
| nindent 8 }} + spec: + # The hook talks to one in-cluster endpoint with one specific SA + # token. Hardened defaults match the tracebloc client chart's + # training-pod template — non-root, all caps dropped. + automountServiceAccountToken: true # we DO need the SA token to authenticate the POST + serviceAccountName: {{ include "ingestor.serviceAccountName" . }} + restartPolicy: Never + securityContext: + runAsNonRoot: true + runAsUser: 65532 + seccompProfile: + type: RuntimeDefault + containers: + - name: submit + image: {{ printf "%s:%s" .Values.hookImage.repository .Values.hookImage.tag | quote }} + imagePullPolicy: {{ .Values.hookImage.pullPolicy | default "IfNotPresent" }} + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: + drop: ["ALL"] + resources: + {{- toYaml .Values.hookResources | nindent 12 }} + env: + - name: JOBS_MANAGER_ENDPOINT + value: {{ .Values.jobsManager.endpoint | quote }} + - name: SUBMIT_PATH + value: {{ .Values.jobsManager.submitPath | quote }} + command: ["/bin/sh", "-c"] + # The script: + # 1. Reads the SA token from the projected volume (k8s adds + # it automatically when automountServiceAccountToken: true). + # 2. POSTs the pre-rendered JSON body from + # /etc/ingest/body.json — Helm built the JSON at template + # time so the script doesn't have to construct or escape + # anything itself. Earlier versions used a python3 / jq + # escape pipeline at runtime; both failed in different + # ways (curlimages/curl strips python3 from the runtime + # layer, and shell-side JSON escaping a multi-line YAML + # scalar is its own bug farm). Bugbot caught both. + # 3. Non-2xx fails the Job, which fails the helm install with + # the response body visible in the install output. + args: + - | + set -eu + + TOKEN_PATH=/var/run/secrets/kubernetes.io/serviceaccount/token + BODY_FILE=/etc/ingest/body.json + + if [ ! 
-r "$TOKEN_PATH" ]; then + echo "ERROR: SA token not found at $TOKEN_PATH" >&2 + exit 1 + fi + if [ ! -r "$BODY_FILE" ]; then + echo "ERROR: request body not found at $BODY_FILE — ConfigMap mount failed" >&2 + exit 1 + fi + + URL="${JOBS_MANAGER_ENDPOINT}${SUBMIT_PATH}" + echo "POST $URL" + echo "Body source: $BODY_FILE" + + HTTP_STATUS=$(curl -sS -o /tmp/resp.json -w "%{http_code}" \ + --max-time {{ .Values.hookTimeoutSeconds | default 30 }} \ + -X POST "$URL" \ + -H "Authorization: Bearer $(cat "$TOKEN_PATH")" \ + -H "Content-Type: application/json" \ + --data-binary "@$BODY_FILE") + + echo "HTTP $HTTP_STATUS" + echo "Response:" + cat /tmp/resp.json + echo + + # 200 = replay (existing run for this key); 201 = freshly + # created. Both are success for the customer's helm install. + if [ "$HTTP_STATUS" -lt 200 ] || [ "$HTTP_STATUS" -ge 300 ]; then + exit 1 + fi + volumeMounts: + - name: ingest-config + mountPath: /etc/ingest + readOnly: true + - name: tmp + mountPath: /tmp + volumes: + - name: ingest-config + configMap: + name: {{ include "ingestor.configMapName" . }} + - name: tmp + emptyDir: {} diff --git a/ingestor/templates/serviceaccount.yaml b/ingestor/templates/serviceaccount.yaml new file mode 100644 index 0000000..2a0cede --- /dev/null +++ b/ingestor/templates/serviceaccount.yaml @@ -0,0 +1,15 @@ +{{- if .Values.serviceAccount.create -}} +apiVersion: v1 +kind: ServiceAccount +metadata: + # The post-install hook Job runs as this SA. Its token is the credential + # jobs-manager validates via TokenReview (client-runtime#21). The SA's + # (namespace, name) must match an entry in the tracebloc release's + # `ingestionAuthz.allowed` ConfigMap; the default chart entry is + # `(release-namespace, "ingestor")` so this default name works + # out-of-the-box. + name: {{ include "ingestor.serviceAccountName" . }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "ingestor.labels" . 
| nindent 4 }} +{{- end }} diff --git a/ingestor/values.yaml b/ingestor/values.yaml new file mode 100644 index 0000000..3a77f7b --- /dev/null +++ b/ingestor/values.yaml @@ -0,0 +1,78 @@ +# ============================================================ +# tracebloc/ingestor — values.yaml +# ============================================================ +# Customer-facing chart. The dominant install is: +# +# helm install my-dataset tracebloc/ingestor --namespace tracebloc \ +# --set-file ingestConfig=./my-ingest.yaml \ +# --set image.digest=sha256: +# +# The chart submits the YAML to jobs-manager; jobs-manager owns the +# resulting Job. `helm uninstall` removes the ConfigMap + hook but +# leaves the Job and ingested data alone (see README.md). + +# -- The ingest.yaml body. REQUIRED. Set via --set-file ingestConfig=path/to/ingest.yaml +# or by overriding in a values file. Validated by jobs-manager against the +# v1 schema from tracebloc/data-ingestors#44; install fails with a +# line-numbered error if invalid. +ingestConfig: "" + +# -- Image of the official tracebloc/ingestor that will run the dataset. +image: + # Repository defaults to the GHCR-published image. Customers running an + # air-gapped mirror override this. + repository: ghcr.io/tracebloc/ingestor + # Digest pinning is REQUIRED — jobs-manager rejects tag-form inputs + # outright (see client-runtime#21's image_digest validation). Set this + # to the digest published in the data-ingestors GitHub Release for the + # version you want to ingest with. + digest: "" + +# -- jobs-manager Service hostname + port to POST to. Defaults assume +# the tracebloc client chart's release name is "tracebloc". Override if +# you've named it differently (e.g., "my-tracebloc..svc..."). +jobsManager: + endpoint: http://jobs-manager.tracebloc.svc.cluster.local:8080 + # The path on jobs-manager that accepts ingestion submissions. Stable + # API; only change if you're testing against a forked jobs-manager. 
+ submitPath: /internal/submit-ingestion-run + +# -- ServiceAccount that the post-install hook runs as. Its token is the +# credential jobs-manager validates via TokenReview, and the SA's name + +# namespace are matched against the ingestionAuthz policy in the +# tracebloc client release. Default name matches the chart's default +# `ingestionAuthz.allowed[0].service_account: ingestor` entry. +serviceAccount: + create: true + name: ingestor + +# -- Image for the post-install hook Job itself (a thin curl wrapper). +# Pinned by tag by default (see below); bumping is a separate chart +# release. +hookImage: + # curlimages/curl is small, official, and pinned by tag here for + # readability. Customers who need digest pinning override this. + repository: curlimages/curl + tag: "8.10.1" + pullPolicy: IfNotPresent + +# -- Resources for the post-install hook. Tiny — it makes one POST. +hookResources: + requests: + cpu: "10m" + memory: "32Mi" + limits: + cpu: "100m" + memory: "64Mi" + +# -- Idempotency key used by jobs-manager to dedupe submissions. Derived +# from the Helm release name + revision by default so a `helm upgrade` +# under the same release submits a fresh run. Override to a stable UUID +# if you want strict at-most-once semantics across reinstalls of the +# same release name. +idempotencyKey: "" + +# -- How long the post-install hook waits for the POST to return before +# failing the install. Includes jobs-manager schema validation + +# k8s API calls; usually completes in <2s.
+hookTimeoutSeconds: 30 From 828a8f53a4d88ba93a5e9b1900b489e5cd81cb11 Mon Sep 17 00:00:00 2001 From: "Asad Iqbal (Saadi)" Date: Mon, 18 May 2026 18:53:01 +0500 Subject: [PATCH 15/24] fix: nil-guard ingestionAuthz access for --reuse-values upgrade path (#124) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit #123's ingestion-authz ConfigMap template did unguarded nested access: {{- range .Values.ingestionAuthz.allowed }} This crashes with "nil pointer evaluating interface {}.allowed" when `.Values.ingestionAuthz` is absent — which is exactly what `helm upgrade --reuse-values` produces against a pre-#123 release. The stored values from the previous deploy don't have the key, and `--reuse-values` doesn't pick up new chart defaults, so the upgrade fails before any of the new resources are created. A real user hit this immediately after #123 merged: Error: UPGRADE FAILED: template: client/templates/ ingestion-authz-configmap.yaml:20:21: executing "..." at <.Values.ingestionAuthz.allowed>: nil pointer evaluating interface {}.allowed Fix: collapse the missing-parent and missing-child cases to an empty list with `default dict` + `default list`. The rendered ConfigMap becomes `allowed:` (empty), which the authz policy parser treats as "no SAs authorized" — fail-safe, matches the intent of "operator hasn't configured this yet". The recommended `helm upgrade` recipe is still `--reset-then-reuse-values` (picks up new defaults including the non-empty `ingestionAuthz.allowed` default), but the template no longer requires that — it renders correctly under either path. Verified ──────── - helm template renders cleanly with default values (full policy), with `--set ingestionAuthz=null` (empty allowed list), and with `--set ingestionAuthz.allowed=null` (same). - helm unittest client/: 116/116 pass, no snapshot changes. 
Co-authored-by: Claude Opus 4.7 (1M context) --- client/templates/ingestion-authz-configmap.yaml | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/client/templates/ingestion-authz-configmap.yaml b/client/templates/ingestion-authz-configmap.yaml index f5a6dc4..f8a1e74 100644 --- a/client/templates/ingestion-authz-configmap.yaml +++ b/client/templates/ingestion-authz-configmap.yaml @@ -17,7 +17,17 @@ metadata: data: ingestion-authz.yaml: | allowed: - {{- range .Values.ingestionAuthz.allowed }} + {{- /* + Nil-guarded chain: an upgrade with `--reuse-values` from a + pre-#123 release won't have `.Values.ingestionAuthz` in its + stored values, and an unguarded `.Values.ingestionAuthz.allowed` + crashes with "nil pointer evaluating interface {}.allowed". + `default dict` + `default list` collapse the missing parent / + missing child to an empty list, which renders as a bare `allowed:` + key with no entries — fail-safe (the authz policy then denies every caller, which + is correct: there's no policy until the operator sets one). + */ -}} + {{- range default list (default dict .Values.ingestionAuthz).allowed }} - service_account: {{ .service_account | quote }} namespace: {{ .namespace | default $.Release.Namespace | quote }} table_prefixes: From cb0db442c3da263af7f45c6f689a44f1fef53c89 Mon Sep 17 00:00:00 2001 From: "Asad Iqbal (Saadi)" Date: Tue, 19 May 2026 11:19:42 +0500 Subject: [PATCH 16/24] feat(#125): wire INGESTOR_IMAGE_DIGEST; drop digest requirement from ingestor subchart (#126) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat(#125): wire INGESTOR_IMAGE_DIGEST; drop digest requirement from ingestor subchart Companion to tracebloc/client-runtime#41 (which made the endpoint treat the request body's `image_digest` as an optional override of a cluster-configured default).
With this PR the ingestor image fits the same auto-update model as every other component in the chart: client/values.yaml + images.ingestor.digest: "" The auto-upgrade cronjob bumps this when a new chart version is published; jobs-manager re-rolls and the new env takes effect. client/templates/jobs-manager-deployment.yaml + INGESTOR_IMAGE_DIGEST env, nil-guarded for --reuse-values from a pre-this-PR release. Empty value renders cleanly (no nil pointer), endpoint then accepts only request-body overrides until the operator sets the chart value. ingestor/values.yaml + templates/configmap-ingest-config.yaml + image.digest is now an OPTIONAL override, not required. + body.json renders without `image_digest` when none is set; the key is included only when the customer explicitly pinned via --set image.digest=... (the override path: reproducing old runs, testing pre-rollout versions, air-gapped mirrors). ingestor/README.md + Removes image.digest from "Required values". + Adds "Pinning a specific image version" section explaining the override use cases and when to reach for them. + Top-of-README install snippet drops --set image.digest=... — the dominant path is now `helm install --set-file ingestConfig=...`. Once both PRs land, the bootstrap step is a one-line bump of client/values.yaml's images.ingestor.digest to the current ghcr.io/tracebloc/ingestor release digest, plus a chart version bump so the auto-upgrade cronjob promotes it. Future ingestor releases follow the same pattern — bump digest + chart version, customers' auto-upgrade picks it up on the next tick. Verified ──────── - helm lint passes on both charts. - helm template renders: - env populated when images.ingestor.digest is set - env empty (nil-guard) when images.ingestor key absent entirely (simulates --reuse-values from pre-this-PR release) - body.json without image_digest when no override - body.json with image_digest when explicit --set image.digest=... - helm unittest client/: 116/116 pass. 
Closes #125 Co-Authored-By: Claude Opus 4.7 (1M context) * chore: bootstrap ingestor digest + bump chart version 1.3.2 → 1.3.3 Activates the auto-update model introduced by the rest of this PR. Without the value set, jobs-manager runs with `INGESTOR_IMAGE_DIGEST=""` and the ingestion endpoint returns 503 for every call that doesn't include a body override — which is the *opposite* of the "customer doesn't have to think about digests" UX this PR is supposed to enable. Two coupled bumps: client/Chart.yaml version: 1.3.2 → 1.3.3 appVersion: 1.3.2 → 1.3.3 Required for the auto-upgrade cronjob to detect this release. `helm search repo` orders by version; without a bump customers stay on 1.3.2 and never see the new env wiring. client/values.yaml images.ingestor.digest = "sha256:e6639b084d0d377072dc908db376050914ebd49c730ddaa13f838d10f5482ea9" The data-ingestors v0.3.0-rc1 release. Future ingestor releases bump both this and Chart.yaml's version; eventually a workflow in tracebloc/data-ingestors can raise the PR automatically when a new image is published. After this lands and the chart is published to gh-pages, a `helm upgrade --reset-then-reuse-values` on the customer's cluster (or the daily auto-upgrade cronjob's next tick) rolls jobs-manager with the env populated, and `helm install tracebloc/ingestor --set-file ingestConfig=...` — no `--set image.digest=...` — works. Verified ──────── - helm lint client/ clean. - helm template shows INGESTOR_IMAGE_DIGEST env populated with the real digest. - helm unittest client/: 116/116 pass. 
Co-Authored-By: Claude Opus 4.7 (1M context) --------- Co-authored-by: Claude Opus 4.7 (1M context) --- client/Chart.yaml | 4 ++-- client/templates/jobs-manager-deployment.yaml | 9 ++++++++ client/values.yaml | 15 +++++++++++++ ingestor/README.md | 22 ++++++++++++++----- .../templates/configmap-ingest-config.yaml | 20 ++++++++++++----- ingestor/values.yaml | 18 +++++++++++---- 6 files changed, 71 insertions(+), 17 deletions(-) diff --git a/client/Chart.yaml b/client/Chart.yaml index 23fecef..cd51c83 100644 --- a/client/Chart.yaml +++ b/client/Chart.yaml @@ -2,8 +2,8 @@ apiVersion: v2 name: client description: A unified Helm chart for tracebloc on AKS, EKS, bare-metal, and OpenShift type: application -version: 1.3.2 -appVersion: "1.3.2" +version: 1.3.3 +appVersion: "1.3.3" keywords: - tracebloc - kubernetes diff --git a/client/templates/jobs-manager-deployment.yaml b/client/templates/jobs-manager-deployment.yaml index e8d4d8e..95dea16 100644 --- a/client/templates/jobs-manager-deployment.yaml +++ b/client/templates/jobs-manager-deployment.yaml @@ -82,6 +82,15 @@ spec: value: {{ include "tracebloc.clientLogsPvc" . | quote }} - name: MYSQL_HOST value: "mysql-client" + # client-runtime#40: default ingestor image digest for the + # POST /internal/submit-ingestion-run endpoint. Auto-upgrade + # keeps this current; the ingestor subchart no longer requires + # the customer to pin a digest. Nil-guarded so `--reuse-values` + # from a pre-this-PR release doesn't crash templating on the + # missing `images.ingestor` key — empty value means jobs-manager + # accepts only request-body overrides and returns 503 if absent. 
+ - name: INGESTOR_IMAGE_DIGEST + value: {{ (default dict .Values.images.ingestor).digest | default "" | quote }} - name: REQUESTS_PROXY_URL value: "http://requests-proxy-service:8888" - name: JOB_IMAGE_HOST diff --git a/client/values.yaml b/client/values.yaml index f077941..0c40385 100644 --- a/client/values.yaml +++ b/client/values.yaml @@ -179,6 +179,21 @@ networkPolicy: images: jobsManager: digest: "" + # client-runtime#40 / client#125: the ingestor image is spawned by + # jobs-manager at ingestion-submission time, not as a long-lived pod. + # Setting `digest` here surfaces it into jobs-manager as the + # `INGESTOR_IMAGE_DIGEST` env; the auto-upgrade flow then keeps it + # current. Customers can override per-install via the ingestor + # subchart's `image.digest` (for pinning / debugging), but the + # dominant path uses this value. + # + # Initial value: ghcr.io/tracebloc/ingestor@. Bump + # this on each ingestor release; chart `version` in Chart.yaml must + # also bump so the auto-upgrade cronjob detects the change. Future + # automation in tracebloc/data-ingestors (release-image.yml) can + # raise a PR to this file when a new image is published. + ingestor: + digest: "sha256:e6639b084d0d377072dc908db376050914ebd49c730ddaa13f838d10f5482ea9" podsMonitor: digest: "" resourceMonitor: diff --git a/ingestor/README.md b/ingestor/README.md index f7893cb..56a9e51 100644 --- a/ingestor/README.md +++ b/ingestor/README.md @@ -5,10 +5,11 @@ A thin chart that submits one data-ingestion run to your tracebloc client cluste ```bash helm install my-dataset tracebloc/ingestor \ --namespace tracebloc \ - --set-file ingestConfig=./my-ingest.yaml \ - --set image.digest=sha256: + --set-file ingestConfig=./my-ingest.yaml ``` +**The ingestor image is managed centrally** by the tracebloc client chart's auto-upgrade flow — you don't need to pin a digest for each install. 
New ingestor releases roll out automatically when the cluster's daily auto-upgrade cronjob (`autoUpgrade.enabled: true` in the client chart) bumps the chart version. See [Pinning a specific image version](#pinning-a-specific-image-version) below for the override path. + ## What this chart owns | Resource | Owner | Lifecycle | @@ -28,9 +29,9 @@ helm install my-dataset tracebloc/ingestor \ 1. `helm install` renders `ConfigMap/-config` containing the customer's `ingest.yaml` body. 2. Helm fires the `post-install` hook: a Job that runs as the chart's ServiceAccount. 3. The hook reads its SA token from the projected volume and the `ingest.yaml` from the ConfigMap mount. -4. The hook POSTs `{ ingest_config, idempotency_key, image_digest }` to `jobs-manager:8080/internal/submit-ingestion-run`. +4. The hook POSTs `{ ingest_config, idempotency_key }` (and `image_digest` if you explicitly pinned one) to `jobs-manager:8080/internal/submit-ingestion-run`. 5. **jobs-manager** validates the SA token via Kubernetes TokenReview, then checks the (SA, table) pair against the cluster's `ingestionAuthz` policy (a ConfigMap rendered by the parent `tracebloc/client` chart). -6. If authorized, jobs-manager validates the YAML against the `ingest.v1` JSON schema, mints a backend token, creates the per-run ConfigMap + Secret + Job, records the run for idempotency, and returns `201` (or `200` if this idempotency_key has been seen before). +6. If authorized, jobs-manager validates the YAML against the `ingest.v1` JSON schema, resolves the image digest (the body's value if you pinned one, otherwise the cluster's configured default from `INGESTOR_IMAGE_DIGEST`), mints a backend token, creates the per-run ConfigMap + Secret + Job, records the run for idempotency, and returns `201` (or `200` if this idempotency_key has been seen before). 7. The hook treats `2xx` as success and exits 0; `helm install` reports success. Non-`2xx` exits 1 and `helm install` fails with the response body in the output. 
The customer never builds an image. The customer never writes a Dockerfile. The customer writes ~8 lines of YAML. @@ -40,7 +41,18 @@ The customer never builds an image. The customer never writes a Dockerfile. The | Value | Description | |---|---| | `ingestConfig` | The full `ingest.yaml` body. **Set via `--set-file`** — the body almost always contains YAML special characters that don't survive `--set`. | -| `image.digest` | A `sha256:<64-hex>` digest of a `ghcr.io/tracebloc/ingestor` release. Tags are rejected by jobs-manager. See the [data-ingestors releases page](https://github.com/tracebloc/data-ingestors/releases) for current digests. | + +## Pinning a specific image version + +The dominant install path leaves `image.digest` empty and lets jobs-manager pick the cluster's current ingestor version (set by the parent client chart's `images.ingestor.digest`, kept current by the auto-upgrade cronjob). Override only when you have a specific reason: + +| Scenario | What to do | +|---|---| +| Reproducing an older ingestion run for audit / debugging | `--set image.digest=sha256:` | +| Testing a new ingestor release before cluster-wide rollout | `--set image.digest=sha256:` ahead of the auto-upgrade tick | +| Air-gapped mirror with frozen versions | Use both `--set image.repository=...` and `--set image.digest=sha256:...` | + +When set, the digest must be the full canonical form (`sha256:` + 64 lowercase hex chars). Tags like `v0.3.0` are rejected by jobs-manager. See the [data-ingestors releases page](https://github.com/tracebloc/data-ingestors/releases) for current digests. ## Frequently-overridden values diff --git a/ingestor/templates/configmap-ingest-config.yaml b/ingestor/templates/configmap-ingest-config.yaml index ccdf2e5..beca6ac 100644 --- a/ingestor/templates/configmap-ingest-config.yaml +++ b/ingestor/templates/configmap-ingest-config.yaml @@ -34,12 +34,20 @@ data: # Pre-rendered JSON body that the post-install hook POSTs to # jobs-manager. 
Computed at helm-template time so the hook is a # one-line `curl --data-binary @body.json` — no python, no jq, no - # shell-escaping the multi-line YAML scalar. The required-value gates - # below fire here so a missing digest or ingest config fails - # templating (and therefore `helm install`) with a clear message - # before any cluster state is touched. + # shell-escaping the multi-line YAML scalar. + # + # `image_digest` is OPTIONAL in the POST body (client-runtime#40): + # when omitted, jobs-manager uses the cluster's configured default + # (set by the tracebloc client chart's `images.ingestor.digest`, + # kept current by the auto-upgrade flow). Including it here only + # when the customer explicitly pinned via `--set image.digest=...` + # means the dominant install path tracks the cluster's current + # version automatically; the override path is preserved. body.json: |- {{- $cfg := required "ingestConfig must be set (use --set-file ingestConfig=path/to/ingest.yaml)" .Values.ingestConfig -}} - {{- $digest := required "image.digest must be a sha256: digest of the ghcr.io/tracebloc/ingestor image (see the data-ingestors GitHub Release for current digests)" .Values.image.digest -}} {{- $key := include "ingestor.idempotencyKey" . -}} - {{- dict "ingest_config" $cfg "idempotency_key" $key "image_digest" $digest | toJson | nindent 4 }} + {{- $body := dict "ingest_config" $cfg "idempotency_key" $key -}} + {{- if .Values.image.digest -}} + {{- $_ := set $body "image_digest" .Values.image.digest -}} + {{- end -}} + {{- $body | toJson | nindent 4 }} diff --git a/ingestor/values.yaml b/ingestor/values.yaml index 3a77f7b..4c7f238 100644 --- a/ingestor/values.yaml +++ b/ingestor/values.yaml @@ -18,14 +18,24 @@ ingestConfig: "" # -- Image of the official tracebloc/ingestor that will run the dataset. +# +# Default behaviour (client-runtime#40 / client#125): leave `digest` +# empty and let the cluster's auto-upgrade flow pick the version. 
+# jobs-manager reads `INGESTOR_IMAGE_DIGEST` (set by the tracebloc +# client chart's `images.ingestor.digest` value) and uses it when +# spawning the ingestor Job. New ingestor releases roll out +# automatically when the cluster's daily auto-upgrade cronjob bumps the +# chart — no per-dataset chart change needed. +# +# Set `digest` here ONLY when you need to pin a specific version, e.g. +# reproducing an older run or testing a new release before cluster-wide +# rollout. Must be the full canonical sha256 digest; tags are rejected. image: # Repository defaults to the GHCR-published image. Customers running an # air-gapped mirror override this. repository: ghcr.io/tracebloc/ingestor - # Digest pinning is REQUIRED — jobs-manager rejects tag-form inputs - # outright (see client-runtime#21's image_digest validation). Set this - # to the digest published in the data-ingestors GitHub Release for the - # version you want to ingest with. + # Optional override. Empty (default) means "use whatever the cluster + # is currently rolling". When set must be sha256:<64 lowercase hex>. digest: "" # -- jobs-manager Service hostname + port to POST to. Defaults assume From 7a70883af6636de254f83a6a543a77901e9cbd87 Mon Sep 17 00:00:00 2001 From: "Asad Iqbal (Saadi)" Date: Tue, 19 May 2026 13:24:36 +0500 Subject: [PATCH 17/24] fix(#127): ingestor chart auto-resolves jobs-manager endpoint to release namespace (#128) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ingestor subchart's default jobsManager.endpoint hardcoded "tracebloc" as the parent release's namespace: http://jobs-manager.tracebloc.svc.cluster.local:8080 Any release in a non-"tracebloc" namespace failed the post-install hook with `curl: (6) Could not resolve host: …`, blocking end-to-end ingestion. Surfaced today during real-cluster validation on a release deployed to `tracebloc-templates`. 
Fix shape: leave the values.yaml default empty; have the post-install hook template the endpoint to use `.Release.Namespace` when no value is set. The override path (cross-namespace install) keeps working — set `jobsManager.endpoint` explicitly and it wins over the default. values.yaml jobsManager.endpoint: "" (was hardcoded to tracebloc namespace) + comment explaining the auto-resolve + override semantics templates/post-install-job.yaml JOBS_MANAGER_ENDPOINT defaults to http://jobs-manager.<.Release.Namespace>.svc.cluster.local:8080 when .Values.jobsManager.endpoint is empty. README.md Frequently-overridden-values entry corrected. Verified ──────── - helm template into namespace `tracebloc-templates` → http://jobs-manager.tracebloc-templates.svc.cluster.local:8080 - helm template into namespace `some-other-ns` → http://jobs-manager.some-other-ns.svc.cluster.local:8080 - helm template with --set jobsManager.endpoint=http://port-forward.localhost:8888 → wins over the default. - helm lint clean. Closes #127 Co-authored-by: Claude Opus 4.7 (1M context) --- ingestor/README.md | 2 +- ingestor/templates/post-install-job.yaml | 9 ++++++++- ingestor/values.yaml | 12 ++++++++---- 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/ingestor/README.md b/ingestor/README.md index 56a9e51..0524003 100644 --- a/ingestor/README.md +++ b/ingestor/README.md @@ -58,7 +58,7 @@ When set, the digest must be the full canonical form (`sha256:` + 64 lowercase h | Value | Default | When to override | |---|---|---| -| `jobsManager.endpoint` | `http://jobs-manager.tracebloc.svc.cluster.local:8080` | The parent `tracebloc/client` release isn't in the `tracebloc` namespace, or you're testing against a port-forward. | +| `jobsManager.endpoint` | `http://jobs-manager.<release-namespace>.svc.cluster.local:8080` (auto-resolved) | The ingestor release and the parent `tracebloc/client` release live in different namespaces, or you're testing against a port-forward.
| | `serviceAccount.name` | `ingestor` | The cluster's `ingestionAuthz` policy expects a different SA name. (Default matches the parent chart's default.) | | `image.repository` | `ghcr.io/tracebloc/ingestor` | Air-gapped mirror. | | `idempotencyKey` | `-` | You want strict at-most-once semantics across re-installs under the same release name. | diff --git a/ingestor/templates/post-install-job.yaml b/ingestor/templates/post-install-job.yaml index 11d2c7d..6c650c5 100644 --- a/ingestor/templates/post-install-job.yaml +++ b/ingestor/templates/post-install-job.yaml @@ -54,8 +54,15 @@ spec: resources: {{- toYaml .Values.hookResources | nindent 12 }} env: + # client#127: default to the jobs-manager Service in this + # release's own namespace rather than hardcoding "tracebloc". + # Releases in any other namespace would otherwise hit + # `curl: Could not resolve host` at the hook. Customers + # whose ingestor release sits in a different namespace from + # the tracebloc release set `jobsManager.endpoint` + # explicitly to override. - name: JOBS_MANAGER_ENDPOINT - value: {{ .Values.jobsManager.endpoint | quote }} + value: {{ .Values.jobsManager.endpoint | default (printf "http://jobs-manager.%s.svc.cluster.local:8080" .Release.Namespace) | quote }} - name: SUBMIT_PATH value: {{ .Values.jobsManager.submitPath | quote }} command: ["/bin/sh", "-c"] diff --git a/ingestor/values.yaml b/ingestor/values.yaml index 4c7f238..e5f8e12 100644 --- a/ingestor/values.yaml +++ b/ingestor/values.yaml @@ -38,11 +38,15 @@ image: # is currently rolling". When set must be sha256:<64 lowercase hex>. digest: "" -# -- jobs-manager Service hostname + port to POST to. Defaults assume -# the tracebloc client chart's release name is "tracebloc". Override if -# you've named it differently (e.g., "my-tracebloc..svc..."). +# -- jobs-manager Service hostname + port to POST to. 
When left empty +# (default), the post-install hook resolves the endpoint to +# `http://jobs-manager.<release-namespace>.svc.cluster.local:8080`, +# which is correct for the common case where the ingestor release is +# installed in the same namespace as the tracebloc client release. +# Override only when the two releases live in different namespaces, or +# when you need a non-standard port/host (e.g., port-forward testing). jobsManager: - endpoint: http://jobs-manager.tracebloc.svc.cluster.local:8080 + endpoint: "" # The path on jobs-manager that accepts ingestion submissions. Stable # API; only change if you're testing against a forked jobs-manager. submitPath: /internal/submit-ingestion-run From f9d870be90bc47a8edd70cd48c2832009ddec02d Mon Sep 17 00:00:00 2001 From: "Asad Iqbal (Saadi)" Date: Tue, 19 May 2026 19:33:21 +0500 Subject: [PATCH 18/24] feat(#129): parent client chart owns the shared ingestor ServiceAccount (#131) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ingestor ServiceAccount is shared by every `tracebloc/ingestor` subchart release in a namespace, but it was owned by the first such release. Concurrent installs of a second ingestor release collided with Helm's "cannot import into current release"; uninstalling the first release ripped the SA out from under all the others. Move the SA into this parent chart, which already owns the matching `ingestionAuthz` ConfigMap, so the SA + policy have the same lifecycle and every ingestor release in the namespace shares the SA cleanly. Plumb the name through `ingestionAuthz.serviceAccountName` as a single source of truth — both the new SA template and the default `allowed` entry in the authz ConfigMap dereference it via the new `tracebloc.ingestorServiceAccountName` helper. The helper nil-guards pre-#129 `--reuse-values` upgrades by defaulting to "ingestor".
Document the SA adoption path in `client/MIGRATION.md` for clusters that already have an `ingestor` SA owned by a 0.1.0 subchart release — re-annotate before upgrading the parent chart so Helm doesn't refuse the import. Bumps chart to 1.3.4. Pair with tracebloc/ingestor 0.2.0, which flips `serviceAccount.create` default to `false` so subchart releases stop trying to own the SA themselves. Co-authored-by: Claude Opus 4.7 (1M context) --- client/Chart.yaml | 4 +- client/MIGRATION.md | 70 +++++++++++++++++++ client/templates/_helpers.tpl | 14 ++++ .../templates/ingestion-authz-configmap.yaml | 8 ++- client/templates/ingestor-serviceaccount.yaml | 18 +++++ .../tests/ingestor_serviceaccount_test.yaml | 45 ++++++++++++ client/values.yaml | 24 ++++--- 7 files changed, 172 insertions(+), 11 deletions(-) create mode 100644 client/templates/ingestor-serviceaccount.yaml create mode 100644 client/tests/ingestor_serviceaccount_test.yaml diff --git a/client/Chart.yaml b/client/Chart.yaml index cd51c83..3f49598 100644 --- a/client/Chart.yaml +++ b/client/Chart.yaml @@ -2,8 +2,8 @@ apiVersion: v2 name: client description: A unified Helm chart for tracebloc on AKS, EKS, bare-metal, and OpenShift type: application -version: 1.3.3 -appVersion: "1.3.3" +version: 1.3.4 +appVersion: "1.3.4" keywords: - tracebloc - kubernetes diff --git a/client/MIGRATION.md b/client/MIGRATION.md index c507eb6..4d43c72 100644 --- a/client/MIGRATION.md +++ b/client/MIGRATION.md @@ -2,6 +2,76 @@ This guide explains how to migrate from the legacy per-platform charts (`aks/`, `bm/`, `eks/`, `oc/`) to the unified `client/` chart. +## Upgrading to 1.3.4 — parent chart owns the shared ingestor ServiceAccount + +[#129](https://github.com/tracebloc/client/issues/129): the ingestor +ServiceAccount has moved from the `tracebloc/ingestor` subchart into this +parent chart. 
Background: the SA is shared by every ingestor subchart +release in a namespace, but per-release Helm ownership meant two concurrent +`helm install tracebloc/ingestor` calls collided with "cannot import into +current release", and uninstalling the first release ripped the SA out +from under all the others. With the SA in the parent chart, every +ingestor release in the namespace shares it cleanly and `helm uninstall` +of any individual ingestor release leaves it alone. + +> **The matching ingestor subchart change** ships as +> `tracebloc/ingestor` **0.2.0** — `serviceAccount.create` default +> flipped from `true` to `false`. Upgrade the subchart releases in +> lockstep with the parent so they stop trying to own the SA. + +### When you need to adopt an existing SA + +If you already have a `tracebloc/ingestor` 0.1.0 release installed in the +same namespace as this `tracebloc/client` release, `kubectl get sa +ingestor -n -o jsonpath='{.metadata.annotations.meta\.helm\.sh/release-name}'` +returns that subchart release's name. Plain `helm upgrade tracebloc/client` +to 1.3.4 will fail with `Unable to continue with update: ServiceAccount +"ingestor" ... exists and cannot be imported into the current release`. + +Transfer Helm ownership before upgrading: + +```bash +# 1. Identify the values you need. +NAMESPACE= +CLIENT_RELEASE= # e.g. "tracebloc" +SA_NAME=ingestor # or ingestionAuthz.serviceAccountName if overridden + +# 2. Re-annotate the SA so Helm sees the parent client release as its owner. +kubectl annotate sa "$SA_NAME" -n "$NAMESPACE" \ + meta.helm.sh/release-name="$CLIENT_RELEASE" \ + meta.helm.sh/release-namespace="$NAMESPACE" \ + --overwrite + +kubectl label sa "$SA_NAME" -n "$NAMESPACE" \ + app.kubernetes.io/managed-by=Helm \ + --overwrite + +# 3. Now run the upgrade — Helm adopts the SA on next reconcile. +helm upgrade "$CLIENT_RELEASE" tracebloc/client \ + -n "$NAMESPACE" --version 1.3.4 --reset-then-reuse-values + +# 4. 
Upgrade each ingestor subchart release to 0.2.0 so it stops trying +# to create the SA itself. The flipped default does this for you, but +# use --reset-then-reuse-values so pre-0.2.0 stored values don't +# re-apply serviceAccount.create=true. +helm upgrade tracebloc/ingestor \ + -n "$NAMESPACE" --version 0.2.0 --reset-then-reuse-values +``` + +If no ingestor 0.1.0 release exists in the namespace yet, you don't have +to do anything — the parent chart creates the SA on first install of +1.3.4 and subsequent ingestor 0.2.0 releases consume it. + +### `--reuse-values` upgrade path + +Operators using plain `--reuse-values` (or the auto-upgrade cronjob +prior to 1.3.0, which used that flag) won't get the new +`ingestionAuthz.serviceAccountName` default. The chart's template +defaults the value to `"ingestor"` when absent, so the SA is created +with the expected name and existing `allowed` entries keep matching. +No template-level breakage; this is the same nil-guard pattern as +[#124](https://github.com/tracebloc/client/pull/124). + ## Upgrading to 1.3.0 — self-upgrade CronJob lands on by default Releases of 1.3.0+ install a `-auto-upgrade` CronJob that polls diff --git a/client/templates/_helpers.tpl b/client/templates/_helpers.tpl index 6515869..bb22b8b 100644 --- a/client/templates/_helpers.tpl +++ b/client/templates/_helpers.tpl @@ -25,6 +25,20 @@ app.kubernetes.io/instance: {{ .Release.Name }} {{ .Release.Name }}-jobs-manager {{- end }} +{{/* + Name of the shared ServiceAccount the parent chart creates for ingestor + subchart releases. Single source of truth — used by: + - templates/ingestor-serviceaccount.yaml (creates the SA) + - templates/ingestion-authz-configmap.yaml (default authz entry) + The ingestor subchart's post-install hook runs as this SA; jobs-manager + validates its token via TokenReview against `ingestionAuthz.allowed`. 
+ Nil-guarded: pre-#129 stored values from `--reuse-values` upgrades won't + have `ingestionAuthz.serviceAccountName`, so default to "ingestor". +*/}} +{{- define "tracebloc.ingestorServiceAccountName" -}} +{{- (default dict .Values.ingestionAuthz).serviceAccountName | default "ingestor" -}} +{{- end }} + {{/* Release-scoped name for the resource-monitor DaemonSet, ServiceAccount, ClusterRoleBinding subject, and selector/pod labels. Multiple releases diff --git a/client/templates/ingestion-authz-configmap.yaml b/client/templates/ingestion-authz-configmap.yaml index f8a1e74..24b02a3 100644 --- a/client/templates/ingestion-authz-configmap.yaml +++ b/client/templates/ingestion-authz-configmap.yaml @@ -26,9 +26,15 @@ data: missing child to an empty list, which renders as `allowed: []` — fail-safe (the authz policy then denies every caller, which is correct: there's no policy until the operator sets one). + + Per-entry `service_account` is nil-guarded to fall back to + `ingestionAuthz.serviceAccountName` (#129) so the default + values.yaml entry doesn't need to repeat the SA name — change + the name in one place and both the SA template + this policy + pick it up. */ -}} {{- range default list (default dict .Values.ingestionAuthz).allowed }} - - service_account: {{ .service_account | quote }} + - service_account: {{ .service_account | default (include "tracebloc.ingestorServiceAccountName" $) | quote }} namespace: {{ .namespace | default $.Release.Namespace | quote }} table_prefixes: {{- range .table_prefixes }} diff --git a/client/templates/ingestor-serviceaccount.yaml b/client/templates/ingestor-serviceaccount.yaml new file mode 100644 index 0000000..284b474 --- /dev/null +++ b/client/templates/ingestor-serviceaccount.yaml @@ -0,0 +1,18 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + # Shared ServiceAccount that every ingestor subchart release in this + # namespace runs its post-install hook as. 
Owned by the parent chart + # (not the subchart) because: + # - multiple concurrent ingestor releases need to share it — per-release + # ownership made the second `helm install tracebloc/ingestor` fail + # with Helm's "cannot import into current release" error (#129); + # - the matching `ingestionAuthz` ConfigMap is already owned here, so + # the SA and the policy referencing it have the same lifecycle. + # + # The subchart's `serviceAccount.create` defaults to `false` as of + # ingestor 0.2.0 — it consumes this SA rather than creating its own. + name: {{ include "tracebloc.ingestorServiceAccountName" . }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "tracebloc.labels" . | nindent 4 }} diff --git a/client/tests/ingestor_serviceaccount_test.yaml b/client/tests/ingestor_serviceaccount_test.yaml new file mode 100644 index 0000000..79319fe --- /dev/null +++ b/client/tests/ingestor_serviceaccount_test.yaml @@ -0,0 +1,45 @@ +suite: Ingestor ServiceAccount +templates: + - templates/ingestor-serviceaccount.yaml +set: + clientId: "test-id" + clientPassword: "test" +tests: + - it: renders the shared ingestor SA with the default name + asserts: + - isKind: + of: ServiceAccount + - equal: + path: metadata.name + value: ingestor + - equal: + path: metadata.namespace + value: NAMESPACE + + - it: respects an overridden ingestionAuthz.serviceAccountName + set: + ingestionAuthz: + serviceAccountName: my-ingestor + asserts: + - equal: + path: metadata.name + value: my-ingestor + + - it: nil-guards when ingestionAuthz is absent from --reuse-values + # Simulates a pre-#129 stored values dict — the helper falls back to + # the literal default "ingestor" rather than panicking on a nil access. 
+ set: + ingestionAuthz: null + asserts: + - equal: + path: metadata.name + value: ingestor + + - it: stamps the standard tracebloc labels + asserts: + - equal: + path: metadata.labels["app.kubernetes.io/name"] + value: client + - equal: + path: metadata.labels["app.kubernetes.io/managed-by"] + value: Helm diff --git a/client/values.yaml b/client/values.yaml index 0c40385..3916c8f 100644 --- a/client/values.yaml +++ b/client/values.yaml @@ -365,8 +365,9 @@ autoUpgrade: # is (namespace, service_account) → list of allowed table-name prefixes; # "*" means any table. # -# Default: allow the ingestor subchart's SA (named after that release) to -# ingest into any table. Customers tighten via overrides — e.g.: +# Default: allow the ingestor subchart's SA (created below by this chart, +# named `ingestionAuthz.serviceAccountName`) to ingest into any table. +# Customers tighten via overrides — e.g.: # # ingestionAuthz: # allowed: @@ -374,11 +375,18 @@ autoUpgrade: # namespace: tracebloc # table_prefixes: ["chest_xrays_", "tumors_"] ingestionAuthz: + # -- Name of the shared ServiceAccount this chart creates for ingestor + # subchart releases (client#129). Single source of truth: the SA + # template and the default `allowed` entry below both read this value. + # Override only if the name "ingestor" collides with something in your + # namespace; the matching ingestor subchart's `serviceAccount.name` + # must be updated to the same value. + serviceAccountName: ingestor allowed: - # Default: the ingestor subchart's SA (named "ingestor") in the same - # namespace as the tracebloc release can ingest into any table. The - # `namespace` field is optional; when omitted the ConfigMap template + # Default: the SA named `ingestionAuthz.serviceAccountName` above, in + # the same namespace as this release, can ingest into any table. 
When + # `service_account` is omitted the ConfigMap template substitutes + # `ingestionAuthz.serviceAccountName`; when `namespace` is omitted it # substitutes `.Release.Namespace`. Customers tighten by adding more - # specific entries with explicit `namespace` and `table_prefixes`. - - service_account: ingestor - table_prefixes: ["*"] + # specific entries with explicit fields. + - table_prefixes: ["*"] From 9d556bbabed5d3f5af3c5c9acc38045dcd1702f5 Mon Sep 17 00:00:00 2001 From: "Asad Iqbal (Saadi)" Date: Tue, 19 May 2026 19:33:45 +0500 Subject: [PATCH 19/24] fix(#130): default idempotency key to install-time stamp, not release revision (#132) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `ingestor.idempotencyKey` previously fell back to `<release-name>-<revision>` when `.Values.idempotencyKey` was unset. Helm restarts revisions at 1 after `helm uninstall`, so reinstalling under the same release name produced the same key. If anything dedupe-relevant changed in between (image digest is the dominant case during testing), jobs-manager correctly rejected the second submission with a 409 — but to a customer following the README it looked like the chart was broken. Default to `<release-name>-<unix-epoch>` instead. Each install gets a fresh key; the opt-in stable-UUID path remains for callers who actually want at-most-once semantics across reinstalls. Note on the printf format: Sprig's `unixEpoch` returns a string (not an int), so the formatter is `%s-%s`, not `%s-%d`. Bumps ingestor subchart 0.1.0 → 0.1.1 (default-behavior change).
Co-authored-by: Claude Opus 4.7 (1M context) --- ingestor/Chart.yaml | 2 +- ingestor/README.md | 2 +- ingestor/templates/_helpers.tpl | 12 ++++++++---- ingestor/values.yaml | 13 ++++++++----- 4 files changed, 18 insertions(+), 11 deletions(-) diff --git a/ingestor/Chart.yaml b/ingestor/Chart.yaml index 87c37fc..0deb7fc 100644 --- a/ingestor/Chart.yaml +++ b/ingestor/Chart.yaml @@ -8,7 +8,7 @@ description: | the ingestor Job directly — this chart owns the config + hook artifacts, not the resulting Job or its data. type: application -version: 0.1.0 +version: 0.1.1 appVersion: "0.3.0-rc1" keywords: - tracebloc diff --git a/ingestor/README.md b/ingestor/README.md index 0524003..bfc8edd 100644 --- a/ingestor/README.md +++ b/ingestor/README.md @@ -61,7 +61,7 @@ When set, the digest must be the full canonical form (`sha256:` + 64 lowercase h | `jobsManager.endpoint` | `http://jobs-manager.<release-namespace>.svc.cluster.local:8080` (auto-resolved) | The ingestor release and the parent `tracebloc/client` release live in different namespaces, or you're testing against a port-forward. | | `serviceAccount.name` | `ingestor` | The cluster's `ingestionAuthz` policy expects a different SA name. (Default matches the parent chart's default.) | | `image.repository` | `ghcr.io/tracebloc/ingestor` | Air-gapped mirror. | -| `idempotencyKey` | `<release-name>-<revision>` | You want strict at-most-once semantics across re-installs under the same release name. | +| `idempotencyKey` | `<release-name>-<unix-epoch>` (regenerated every install) | You want strict at-most-once semantics across reinstalls of the same release name — pass a stable UUID so jobs-manager replays the original run instead of starting a new one. | | `hookTimeoutSeconds` | `30` | Slow networks or large schemas. | See `values.yaml` for the full set.
diff --git a/ingestor/templates/_helpers.tpl b/ingestor/templates/_helpers.tpl index 1d1bcf8..c47062f 100644 --- a/ingestor/templates/_helpers.tpl +++ b/ingestor/templates/_helpers.tpl @@ -31,15 +31,19 @@ Naming: {{- end -}} {{- /* -Resolved idempotency key. Defaults to "<release-name>-<revision>" so each -helm install / upgrade submits a fresh run; explicit override is -honored verbatim. +Resolved idempotency key. Defaults to "<release-name>-<unix-epoch>" so each +install is a fresh run — including reinstalls under the same release +name, where Helm restarts revisions at 1 and a revision-derived key +would collide with the previous attempt and trip jobs-manager's +"already used with a different image_digest or table" guard. Explicit +override is honored verbatim; set it to a stable UUID only when you +want at-most-once semantics across reinstalls. */ -}} {{- define "ingestor.idempotencyKey" -}} {{- if .Values.idempotencyKey -}} {{ .Values.idempotencyKey }} {{- else -}} -{{ printf "%s-%d" .Release.Name .Release.Revision }} +{{ printf "%s-%s" .Release.Name (now | unixEpoch) }} {{- end -}} {{- end -}} diff --git a/ingestor/values.yaml b/ingestor/values.yaml index e5f8e12..545082c 100644 --- a/ingestor/values.yaml +++ b/ingestor/values.yaml @@ -79,11 +79,14 @@ hookResources: cpu: "100m" memory: "64Mi" -# -- Idempotency key used by jobs-manager to dedupe submissions. Derived -# from the Helm release name + revision by default so a `helm upgrade` -# under the same release submits a fresh run. Override to a stable UUID -# if you want strict at-most-once semantics across reinstalls of the -# same release name. +# -- Idempotency key used by jobs-manager to dedupe submissions. +# Defaults to `<release-name>-<unix-epoch>` so each install is a fresh run — +# including reinstalls under the same release name.
(A revision-derived +# default would collide on `helm uninstall && helm install`, because +# Helm resets revisions to 1 on the next install; jobs-manager would +# then reject the second attempt with a 409 if anything dedupe-relevant +# changed in between.) Override to a stable UUID only when you want +# at-most-once semantics across reinstalls of the same release name. idempotencyKey: "" # -- How long the post-install hook waits for the POST to return before From 05c2508588c734b2e1ccb76f73a5bacc0b10cc66 Mon Sep 17 00:00:00 2001 From: "Asad Iqbal (Saadi)" Date: Wed, 20 May 2026 15:58:39 +0500 Subject: [PATCH 20/24] feat(#129)!: default serviceAccount.create=false; parent chart owns the SA (#133) The ingestor SA is shared across every `tracebloc/ingestor` release in the namespace. The previous per-release ownership made the second concurrent install collide with Helm's "cannot import into current release" error, and uninstalling the first release deleted the SA out from under any sibling release that worked around the collision with `serviceAccount.create=false`. The parent `tracebloc/client` chart 1.3.4 now owns the SA, exposing its name via `ingestionAuthz.serviceAccountName`. This subchart's default flips to `create: false` so it consumes that shared SA. The `name` value is still required so the post-install hook Job knows which SA's token to mount. `serviceAccount.create=true` remains available as an escape hatch for operators on a pre-1.3.4 parent chart, with a comment in values.yaml explaining when (and only when) to flip it back on. Breaking change: bumps chart to 0.2.0. Pair with the 1.3.4 parent chart bump; see the parent's MIGRATION.md "Upgrading to 1.3.4" section for the SA-adoption procedure on clusters where a 0.1.0 release already created the SA. 
Co-authored-by: Claude Opus 4.7 (1M context) --- ingestor/Chart.yaml | 2 +- ingestor/README.md | 18 ++++++++++++++++-- ingestor/values.yaml | 21 ++++++++++++++++++--- 3 files changed, 35 insertions(+), 6 deletions(-) diff --git a/ingestor/Chart.yaml b/ingestor/Chart.yaml index 0deb7fc..baf9fbd 100644 --- a/ingestor/Chart.yaml +++ b/ingestor/Chart.yaml @@ -8,7 +8,7 @@ description: | the ingestor Job directly — this chart owns the config + hook artifacts, not the resulting Job or its data. type: application -version: 0.1.1 +version: 0.2.0 appVersion: "0.3.0-rc1" keywords: - tracebloc diff --git a/ingestor/README.md b/ingestor/README.md index bfc8edd..aad7c40 100644 --- a/ingestor/README.md +++ b/ingestor/README.md @@ -10,13 +10,27 @@ helm install my-dataset tracebloc/ingestor \ **The ingestor image is managed centrally** by the tracebloc client chart's auto-upgrade flow — you don't need to pin a digest for each install. New ingestor releases roll out automatically when the cluster's daily auto-upgrade cronjob (`autoUpgrade.enabled: true` in the client chart) bumps the chart version. See [Pinning a specific image version](#pinning-a-specific-image-version) below for the override path. +## Prerequisites + +> **Install the `tracebloc/client` parent chart (1.3.4 or newer) into the +> target namespace before installing this chart.** The parent chart +> creates the `ingestor` ServiceAccount this chart's post-install hook +> runs as, and renders the `ingestionAuthz` ConfigMap that authorizes +> it. Without those preconditions the hook either has no SA to mount +> or fails authentication at jobs-manager. + +The SA is shared by every `tracebloc/ingestor` release in the namespace +— that's the point. Before 0.2.0 this chart created the SA itself, +which broke as soon as a second ingestor release tried to install +([tracebloc/client#129](https://github.com/tracebloc/client/issues/129)). 
+ ## What this chart owns | Resource | Owner | Lifecycle | |---|---|---| | `ConfigMap/-config` (holds `ingest.yaml`) | this chart | created by `helm install`, deleted by `helm uninstall` | | `Job/-submit` (post-install hook that POSTs) | this chart | created post-install, removed before each `helm upgrade` | -| `ServiceAccount/` | this chart (optional, default true) | created by `helm install`, deleted by `helm uninstall` | +| `ServiceAccount/` | **parent `tracebloc/client` chart** (as of 0.2.0; this chart can still create it via `serviceAccount.create: true` when targeting a pre-1.3.4 parent) | tied to the parent client release | | `ConfigMap/ingest-config-` (per-run, mounted into the ingestor Pod) | **jobs-manager** | created by jobs-manager on accept; not managed by Helm | | `Secret/ingest-token-` (per-run, holds `BACKEND_TOKEN`) | **jobs-manager** | same | | `Job/ingest-job-` (the actual ingestor) | **jobs-manager** | same | @@ -85,7 +99,7 @@ kubectl -n tracebloc logs -l tracebloc.io/ingestion-run --tail=-1 helm uninstall my-dataset --namespace tracebloc ``` -Removes the chart's ConfigMap + hook Job + ServiceAccount. Does **not** remove the running ingestor Job, its outputs, or the metadata posted to the backend — those are owned by jobs-manager and the cluster respectively. +Removes the chart's ConfigMap + hook Job. The shared `ingestor` ServiceAccount is owned by the parent `tracebloc/client` release (as of 0.2.0) and stays put for other ingestor releases in the namespace. Does **not** remove the running ingestor Job, its outputs, or the metadata posted to the backend — those are owned by jobs-manager and the cluster respectively. To cancel an in-flight run, work with jobs-manager directly: diff --git a/ingestor/values.yaml b/ingestor/values.yaml index 545082c..91a2ab1 100644 --- a/ingestor/values.yaml +++ b/ingestor/values.yaml @@ -54,10 +54,25 @@ jobsManager: # -- ServiceAccount that the post-install hook runs as. 
Its token is the # credential jobs-manager validates via TokenReview, and the SA's name + # namespace are matched against the ingestionAuthz policy in the -# tracebloc client release. Default name matches the chart's default -# `ingestionAuthz.allowed[0].service_account: ingestor` entry. +# tracebloc client release. +# +# As of 0.2.0 (tracebloc/client#129) the SA is owned by the parent +# `tracebloc/client` chart (templated as `ingestor` by default, or +# whatever `ingestionAuthz.serviceAccountName` resolves to). The SA is +# shared by every ingestor subchart release in the namespace, so per- +# release ownership here collided with Helm on concurrent installs and +# made uninstalls rip the SA out from under sibling releases. +# +# Default: `create: false`. The chart consumes the existing SA created +# by the parent client release. The `name` value is still required so +# the post-install hook Job knows which SA's token to mount. +# +# Override `create: true` ONLY when running against a parent client +# chart older than 1.3.4 that didn't yet own the SA — in that case make +# sure no other ingestor release in the same namespace also has +# `create: true`, or you'll reproduce client#129. serviceAccount: - create: true + create: false name: ingestor # -- Image for the post-install hook Job itself (a thin curl wrapper). From df56d25efeda8cd876811fe96b66cc3777906c34 Mon Sep 17 00:00:00 2001 From: "Asad Iqbal (Saadi)" Date: Wed, 20 May 2026 16:04:27 +0500 Subject: [PATCH 21/24] chore(chart): bump ingestor digest to v0.3.0 + chart to 1.3.5 (#134) v0.3.0 is the first production-ready ingestor release (signed + SBOM), validated end-to-end against EKS on 2026-05-19 (6 files in PVC + 576 MySQL rows via the declarative chart path). 
The previous default (v0.3.0-rc1) had three real-cluster bugs that landed as tracebloc/data-ingestors#106: - #103 wheel + sdist were missing schema/ingest.v1.json - #104 image-resolution validator tuple-vs-list comparison - #105 _has_extension dot/case normalization (no more cat1.jpeg.jpeg) Chart bumped to 1.3.5 so the auto-upgrade cronjob (#69) detects the change and rolls customers onto v0.3.0 on the next tick. ingestor image: ghcr.io/tracebloc/ingestor@sha256:463e2367...07a4a cosign verify available; release notes contain the verification command. Co-authored-by: Claude Opus 4.7 (1M context) --- client/Chart.yaml | 4 ++-- client/values.yaml | 8 +++++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/client/Chart.yaml b/client/Chart.yaml index 3f49598..b2916af 100644 --- a/client/Chart.yaml +++ b/client/Chart.yaml @@ -2,8 +2,8 @@ apiVersion: v2 name: client description: A unified Helm chart for tracebloc on AKS, EKS, bare-metal, and OpenShift type: application -version: 1.3.4 -appVersion: "1.3.4" +version: 1.3.5 +appVersion: "1.3.5" keywords: - tracebloc - kubernetes diff --git a/client/values.yaml b/client/values.yaml index 3916c8f..df4903a 100644 --- a/client/values.yaml +++ b/client/values.yaml @@ -187,13 +187,15 @@ images: # subchart's `image.digest` (for pinning / debugging), but the # dominant path uses this value. # - # Initial value: ghcr.io/tracebloc/ingestor@. Bump - # this on each ingestor release; chart `version` in Chart.yaml must + # Bumped to v0.3.0 (2026-05-20) — first production-ready release of + # the declarative-YAML ingestor with schema-packaging, image-validator, + # and file-transfer fixes from tracebloc/data-ingestors#106. + # Bump this on each ingestor release; chart `version` in Chart.yaml must # also bump so the auto-upgrade cronjob detects the change. Future # automation in tracebloc/data-ingestors (release-image.yml) can # raise a PR to this file when a new image is published. 
ingestor: - digest: "sha256:e6639b084d0d377072dc908db376050914ebd49c730ddaa13f838d10f5482ea9" + digest: "sha256:463e236748708a5e3564569eec9173ea8cb3bcf515992d4939c5b610f3807a4a" podsMonitor: digest: "" resourceMonitor: From 373b27f88f890ef582bc03e7808068869eaeb9b8 Mon Sep 17 00:00:00 2001 From: "Asad Iqbal (Saadi)" Date: Wed, 20 May 2026 17:13:52 +0500 Subject: [PATCH 22/24] fix(#135): publish ingestor subchart alongside parent chart (#136) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The customer-facing install path is helm repo add tracebloc https://tracebloc.github.io/client helm install my-dataset tracebloc/ingestor \ --namespace tracebloc-templates \ --set-file ingestConfig=./my.yaml For `tracebloc/ingestor` to resolve from that helm repo, the ingestor subchart must be packaged into gh-pages alongside the parent client chart. Before this PR, `release-helm-chart.yaml` only ran `helm package ./client`, so the second install path returned `Error: chart "ingestor" not found`. helm-ci.yaml also only lints the parent chart, so any future regression in `ingestor/templates/` would land on develop without CI noticing. Three changes: 1. release-helm-chart.yaml: package + index BOTH client and ingestor into a single shared index.yaml. Attach both tgzs to the GitHub release for download-by-tag pinning. 2. helm-ci.yaml: lint the ingestor subchart on every PR alongside the per-platform client lints. Plain `helm lint --strict ./ingestor` is enough — its only required value (ingestConfig) emits INFO not FAIL, and the chart's templates don't branch on platform so the per-platform values-file matrix doesn't apply. 3. ingestor/Chart.yaml: bump appVersion 0.3.0-rc1 → 0.3.0 to match the tracebloc/data-ingestors v0.3.0 release that just shipped. Chart version (0.2.0) is unchanged; appVersion is descriptive. 
Validated locally: both charts package cleanly (client-1.3.5.tgz, ingestor-0.2.0.tgz), all four platform-specific client lints pass, ingestor lint passes. Closes #135. Co-authored-by: Claude Opus 4.7 (1M context) --- .github/workflows/helm-ci.yaml | 17 +++++- .github/workflows/release-helm-chart.yaml | 69 +++++++++++++++-------- ingestor/Chart.yaml | 2 +- 3 files changed, 63 insertions(+), 25 deletions(-) diff --git a/.github/workflows/helm-ci.yaml b/.github/workflows/helm-ci.yaml index c9f61c3..0ae78fc 100644 --- a/.github/workflows/helm-ci.yaml +++ b/.github/workflows/helm-ci.yaml @@ -5,11 +5,13 @@ on: branches: [main, openshift] paths: - 'client/**' + - 'ingestor/**' - '.github/workflows/helm-ci.yaml' pull_request: branches: [main, openshift] paths: - 'client/**' + - 'ingestor/**' - '.github/workflows/helm-ci.yaml' jobs: @@ -24,13 +26,24 @@ jobs: with: version: v3.15.4 - - name: Helm lint (strict) — all platforms + - name: Helm lint (strict) — parent client chart, all platforms run: | for f in client/ci/*-values.yaml; do - echo "=== Linting with $f ===" + echo "=== Linting client with $f ===" helm lint --strict ./client -f "$f" done + - name: Helm lint (strict) — ingestor subchart + # The ingestor subchart has no per-platform CI values files because + # its templates don't branch on platform — they only render a + # ConfigMap + Job + SA. The chart's one required value + # (`ingestConfig`) is supplied by customers at install time via + # `--set-file`; missing it during lint emits an INFO, not a FAIL, + # so plain `helm lint --strict ./ingestor` exercises the templates + # cleanly. See #135 for why we lint it here at all (publish workflow + # now packages both charts). 
+ run: helm lint --strict ./ingestor + template: name: Template render runs-on: ubuntu-latest diff --git a/.github/workflows/release-helm-chart.yaml b/.github/workflows/release-helm-chart.yaml index 4bc015b..61b2f39 100644 --- a/.github/workflows/release-helm-chart.yaml +++ b/.github/workflows/release-helm-chart.yaml @@ -37,28 +37,44 @@ jobs: with: version: v3.15.4 - - name: Lint chart - # Use a CI values file so `--strict` sees non-empty clientId / - # clientPassword (the defaults in values.yaml are deliberately empty - # to force operators to supply real credentials, and the schema - # enforces minLength:1). Exhaustive multi-platform linting happens in - # helm-ci.yaml on every PR — here we just need the chart to lint - # cleanly for packaging. - run: helm lint --strict ./client -f client/ci/eks-values.yaml + - name: Lint charts + # The parent `client` chart needs a CI values file so `--strict` sees + # non-empty clientId / clientPassword (the defaults in values.yaml are + # deliberately empty to force operators to supply real credentials, and + # the schema enforces minLength:1). The `ingestor` subchart needs no + # values file — its only required value (ingestConfig) emits an INFO + # level, not FAIL, when missing, and customers always supply it via + # --set-file at install time. + # Exhaustive multi-platform linting happens in helm-ci.yaml on every + # PR — here we just need both charts to lint cleanly for packaging. + run: | + helm lint --strict ./client -f client/ci/eks-values.yaml + helm lint --strict ./ingestor - - name: Package tracebloc chart + - name: Package tracebloc charts id: package + # Package BOTH the parent client chart AND the ingestor subchart. + # Customers run two flavors of install: + # 1. `helm install tracebloc/client ...` — bootstraps the cluster. + # 2. `helm install tracebloc/ingestor ...` — per-dataset ingestion. 
+ # Both must resolve from the same helm repo, so both .tgz files need + # to live alongside each other in gh-pages and be referenced in a + # single shared index.yaml. Earlier versions packaged only `./client` + # and the second install path failed with "chart not found" — see #135. run: | helm package ./client - TGZ=$(ls -t client-*.tgz | head -1) - echo "chart_tgz=$TGZ" >> $GITHUB_OUTPUT - echo "Packaged $TGZ" + helm package ./ingestor + echo "Packaged:" + ls -la *.tgz - - name: Upload chart artifact + - name: Upload chart artifacts uses: actions/upload-artifact@v4 with: - name: helm-chart - path: ${{ steps.package.outputs.chart_tgz }} + name: helm-charts + # Glob picks up both client-*.tgz and ingestor-*.tgz. + path: | + client-*.tgz + ingestor-*.tgz - name: Configure Git run: | @@ -70,7 +86,8 @@ jobs: run: | # Remove any local chart packages before switching branches to avoid # "untracked working tree files would be overwritten" checkout errors. - rm -f client-*.tgz || true + # Cover both chart name prefixes — see Package step above. + rm -f client-*.tgz ingestor-*.tgz || true if git fetch origin gh-pages 2>/dev/null; then git checkout gh-pages echo "index_exists=true" >> $GITHUB_OUTPUT @@ -80,37 +97,45 @@ jobs: echo "index_exists=false" >> $GITHUB_OUTPUT fi - - name: Download chart artifact + - name: Download chart artifacts uses: actions/download-artifact@v4 with: - name: helm-chart + name: helm-charts - name: Update index and push to gh-pages env: REPO_URL: https://${{ github.repository_owner }}.github.io/${{ github.event.repository.name }} run: | + # `helm repo index .` indexes every .tgz in the working dir, so both + # client-*.tgz and ingestor-*.tgz get listed in the shared + # index.yaml. Customers then resolve `tracebloc/client` and + # `tracebloc/ingestor` from the same repo URL. if [ "${{ steps.fetch.outputs.index_exists }}" = "true" ] && [ -f index.yaml ]; then helm repo index . --url "$REPO_URL" --merge index.yaml else helm repo index . 
--url "$REPO_URL" fi - git add index.yaml client-*.tgz + git add index.yaml client-*.tgz ingestor-*.tgz git status if git diff --staged --quiet; then echo "No index/tgz changes to commit" else - git commit -m "Release helm chart(s): $(ls client-*.tgz 2>/dev/null | tr '\n' ' ')" + git commit -m "Release helm chart(s): $(ls client-*.tgz ingestor-*.tgz 2>/dev/null | tr '\n' ' ')" git push origin gh-pages fi - - name: Upload chart to GitHub Release + - name: Upload charts to GitHub Release # Pin tag_name from the release event payload rather than relying on # github.ref / github.ref_name, which intermittently arrive empty on # release-triggered runs (actions/runner#2788, still open). + # Attach BOTH chart tgzs so downloaders pinning a specific release + # tag can pull either chart's exact bytes from this release page. uses: softprops/action-gh-release@v2 with: tag_name: ${{ github.event.release.tag_name }} - files: client-*.tgz + files: | + client-*.tgz + ingestor-*.tgz generate_release_notes: true env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} \ No newline at end of file diff --git a/ingestor/Chart.yaml b/ingestor/Chart.yaml index baf9fbd..9ca3a34 100644 --- a/ingestor/Chart.yaml +++ b/ingestor/Chart.yaml @@ -9,7 +9,7 @@ description: | artifacts, not the resulting Job or its data. type: application version: 0.2.0 -appVersion: "0.3.0-rc1" +appVersion: "0.3.0" keywords: - tracebloc - kubernetes From e3c0e73c23f6ce9b6ce9ef2f867214a289a34163 Mon Sep 17 00:00:00 2001 From: "Asad Iqbal (Saadi)" Date: Wed, 20 May 2026 17:24:05 +0500 Subject: [PATCH 23/24] docs(ingestor): explain image vs chart update lifecycle (#138) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Customers ask: "the cluster has an auto-upgrade cronjob — does that mean my ingestor chart updates too?" 
The answer is nuanced: the image auto-updates (via INGESTOR_IMAGE_DIGEST on jobs-manager, kept current by the cronjob), but the chart on your workstation is independent — Helm's repo cache doesn't refresh itself. Add a "How updates work" section that explains the two-layer model and the strong property that the image you run is decoupled from the chart version that submitted the request. Plus an explicit FAQ on previously-installed ingestor releases (nothing to upgrade — fire-and-forget). No code change. Co-authored-by: Claude Opus 4.7 (1M context) --- ingestor/README.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/ingestor/README.md b/ingestor/README.md index aad7c40..10e375d 100644 --- a/ingestor/README.md +++ b/ingestor/README.md @@ -50,6 +50,20 @@ which broke as soon as a second ingestor release tried to install The customer never builds an image. The customer never writes a Dockerfile. The customer writes ~8 lines of YAML. +## How updates work + +The ingestor has two independent update lifecycles, and customers usually only need to think about one. + +**Image: always current, automatically.** New `ghcr.io/tracebloc/ingestor` releases roll out to your cluster via the parent `tracebloc/client` chart's auto-upgrade cronjob (`autoUpgrade.enabled: true`, default). The cronjob runs `helm repo update` + `helm upgrade tracebloc/client` daily, which writes the new digest into the `INGESTOR_IMAGE_DIGEST` env on the running `tracebloc-jobs-manager` deployment. Your next `helm install tracebloc/ingestor ...` uses the new image automatically — no digest to pin, no version to track, no redeploy of anything you've already installed. + +**Chart: refresh your local cache before each install.** Helm's repo cache on _your workstation_ is independent of the cluster. The cluster's cronjob can refresh its own cache, but it cannot reach your laptop. 
Run `helm repo update` before each install to pick up new chart features (new values, new templates, new defaults). A stale cache still works — it just locks you out of chart-level options added since you last refreshed. **The image you run does not depend on the chart version**: jobs-manager picks the current `INGESTOR_IMAGE_DIGEST` regardless of which subchart version submitted the request. + +This stratification is intentional. The image picks up bugfixes and security patches without anyone restating their dataset configs; the chart only changes when there's a real protocol or UX shift. + +### What about previously-installed ingestor releases? + +Nothing to upgrade. The chart is fire-and-forget: each `helm install` POSTs once to jobs-manager, the ingestor Job runs to completion, and the chart artifacts (ConfigMap + completed hook Job) become inert. There's no controller to update, no deployment to roll, no scheduled work to bump. `helm upgrade <release>` would replay the same submission as a 200 no-op (the idempotency key was stamped at install time and is preserved under `--reuse-values`). + ## Required values | Value | Description | From 2fa37fcdc26eaeaf93fb9f176a50028c8e2567ec Mon Sep 17 00:00:00 2001 From: "Asad Iqbal (Saadi)" Date: Wed, 20 May 2026 18:35:50 +0500 Subject: [PATCH 24/24] Fix three bugbot findings from PR #137 review (#142) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix(#139): preserve idempotency key across helm upgrade The ingestor.idempotencyKey helper defaulted to "<release-name>-<unix-epoch>" and re-stamped on every render. `helm upgrade --reuse-values` preserves the stored value "" (not the previously-rendered key), so the template re-evaluated `now | unixEpoch` and produced a NEW key each upgrade — accidentally creating duplicate ingestion runs from what customers expected to be no-op upgrades. Contradicts the documented behavior in ingestor/README.md added in #138.
Look up the existing post-install hook ConfigMap from the previous render and reuse its idempotency_key. On fresh install (or after uninstall) the lookup returns empty and we fall through to the now-based default. `helm template` (no cluster connection) returns empty for lookup too, so local previews still get a fresh key per render — matches the in-cluster install path the first time. Caught by bugbot on PR #137 review. Closes #139. Co-Authored-By: Claude Opus 4.7 (1M context) * fix(#140): read requests-proxy resources from values The requests-proxy deployment hardcoded its container resources, ignoring the resources.requestsProxy schema entry that values.schema.json has defined since the requests-proxy was added. Every other component (jobsManager, podsMonitor, mysql) reads from .Values.resources.<component>.* with defaults — bring requestsProxy in line with that pattern. Adds the resources.requestsProxy block to values.yaml with the existing hardcoded defaults so behavior on a fresh install is unchanged. The template uses the default-through-dict nil-guard idiom so `helm upgrade --reuse-values` from a pre-1.3.6 release (where the value didn't exist) still renders cleanly without crashing on a nil parent. Caught by bugbot on PR #137 review. Closes #140. Co-Authored-By: Claude Opus 4.7 (1M context) * fix(#141): add images.ingestor entry to values.schema.json values.yaml has had images.ingestor.digest since #126, and the jobs-manager template surfaces it as INGESTOR_IMAGE_DIGEST, but the schema didn't validate it — every other image (jobsManager, podsMonitor, resourceMonitor, requestsProxy, mysqlClient, busybox) has an entry. An operator setting --set images.ingestor.digest=foo (not the canonical sha256:<64-hex>) bypassed schema validation and failed only later inside submit_ingestion_run.py. Add the missing entry mirroring the other image entries' shape.
helm template now rejects malformed digests at chart-template time ("values don't meet the specifications of the schema(s)... Does not match pattern '^(sha256:[a-f0-9]{64})?$'") rather than waiting for runtime. Caught by bugbot on PR #137 review. Closes #141. Co-Authored-By: Claude Opus 4.7 (1M context) --------- Co-authored-by: Claude Opus 4.7 (1M context) --- .../templates/requests-proxy-deployment.yaml | 23 ++++++++-- client/values.schema.json | 10 +++++ client/values.yaml | 11 +++++ ingestor/templates/_helpers.tpl | 43 ++++++++++++++++--- 4 files changed, 76 insertions(+), 11 deletions(-) diff --git a/client/templates/requests-proxy-deployment.yaml b/client/templates/requests-proxy-deployment.yaml index b0e2346..cde610c 100644 --- a/client/templates/requests-proxy-deployment.yaml +++ b/client/templates/requests-proxy-deployment.yaml @@ -48,13 +48,28 @@ spec: capabilities: drop: ["ALL"] readOnlyRootFilesystem: true + {{- /* Nil-guard the whole resources.requestsProxy chain. A + helm upgrade --reuse-values from a release predating this + field (any 1.3.x ≤ 1.3.5) won't have it in stored values, + and the direct path .Values.resources.requestsProxy... on + a nil parent dict crashes with "nil pointer evaluating + interface{}.requests". Default-through-dict idiom lets + missing values fall through to the inline literals. + The trailing dash on `{{-` strips the newline AFTER + readOnlyRootFilesystem; we deliberately do NOT use `-}}` + on the last `:=` so the newline before `resources:` is + preserved (an earlier version of this guard ate the + newline and rendered `readOnlyRootFilesystem: trueresources:`). 
*/ -}} + {{- $rp := default dict (default dict .Values.resources).requestsProxy }} + {{- $rpReq := default dict $rp.requests }} + {{- $rpLim := default dict $rp.limits }} resources: requests: - cpu: 100m - memory: 256Mi + cpu: {{ $rpReq.cpu | default "100m" | quote }} + memory: {{ $rpReq.memory | default "256Mi" | quote }} limits: - cpu: 1000m - memory: 512Mi + cpu: {{ $rpLim.cpu | default "1000m" | quote }} + memory: {{ $rpLim.memory | default "512Mi" | quote }} env: - name: MYSQL_HOST value: "mysql-client" diff --git a/client/values.schema.json b/client/values.schema.json index 5d2a835..d2efd22 100644 --- a/client/values.schema.json +++ b/client/values.schema.json @@ -293,6 +293,16 @@ } } }, + "ingestor": { + "type": "object", + "properties": { + "digest": { + "type": "string", + "pattern": "^(sha256:[a-f0-9]{64})?$", + "description": "Optional canonical ghcr.io/tracebloc/ingestor digest (sha256:<64 hex>). Surfaced into jobs-manager as the INGESTOR_IMAGE_DIGEST env so the customer-facing tracebloc/ingestor subchart can pick it up without per-install overrides. Empty disables the default and forces customers to --set image.digest on each ingestor subchart install." + } + } + }, "mysqlClient": { "type": "object", "properties": { diff --git a/client/values.yaml b/client/values.yaml index df4903a..28de382 100644 --- a/client/values.yaml +++ b/client/values.yaml @@ -249,6 +249,17 @@ resources: limits: cpu: "500m" memory: "512Mi" + # requests-proxy serves the Service Bus / backend communication path and + # is mostly idle (a few req/min). 100m/256Mi requests with headroom for + # the occasional burst; revisit if heap profiling shows growth. Schema + # at values.schema.json#resources.requestsProxy validates overrides. + requestsProxy: + requests: + cpu: "100m" + memory: "256Mi" + limits: + cpu: "1000m" + memory: "512Mi" # -- PriorityClass for the data-plane (mysql). # Cluster-scoped resource. 
Created with helm.sh/resource-policy: keep so a diff --git a/ingestor/templates/_helpers.tpl b/ingestor/templates/_helpers.tpl index c47062f..5a8edf7 100644 --- a/ingestor/templates/_helpers.tpl +++ b/ingestor/templates/_helpers.tpl @@ -31,21 +31,50 @@ Naming: {{- end -}} {{- /* -Resolved idempotency key. Defaults to "<release-name>-<unix-epoch>" so each -install is a fresh run — including reinstalls under the same release -name, where Helm restarts revisions at 1 and a revision-derived key -would collide with the previous attempt and trip jobs-manager's -"already used with a different image_digest or table" guard. Explicit -override is honored verbatim; set it to a stable UUID only when you -want at-most-once semantics across reinstalls. +Resolved idempotency key. + +Default behavior: + - First helm install: stamp a fresh "<release-name>-<unix-epoch>" key. + - helm upgrade of the same release: REUSE the existing key by looking + up the post-install hook ConfigMap from the previous render. This + preserves replay semantics — jobs-manager sees the same key on + upgrade and returns 200 (replay) rather than spawning a new run. + - helm install after uninstall: lookup misses (ConfigMap was deleted + on uninstall), so we fall through to a fresh now-based key. No + collision with the previous run because the epoch differs. + +Earlier versions defaulted to `now | unixEpoch` on every render. That +worked for installs but accidentally created a NEW key on +`helm upgrade --reuse-values` (Helm preserves the stored value `""`, +not the previously-rendered key, so the template re-evaluates `now`). +The result: customers running `helm upgrade` thinking it was a no-op +got duplicate ingestion runs. Bugbot caught it on PR #137. See #139. + +Helm template (no cluster connection) returns empty for lookup, so +local previews always re-stamp with a fresh key — matches the +in-cluster install path the first time around.
+ +Explicit override is honored verbatim; set `idempotencyKey` to a +stable UUID when you want strict at-most-once semantics across +uninstall/reinstall cycles. */ -}} {{- define "ingestor.idempotencyKey" -}} {{- if .Values.idempotencyKey -}} {{ .Values.idempotencyKey }} {{- else -}} +{{- $existing := lookup "v1" "ConfigMap" .Release.Namespace (include "ingestor.configMapName" .) -}} +{{- /* The ConfigMap key is literally "body.json" (a single key with a dot in + its name, not a nested path), so use `index` rather than dot-access. + The fromJson call then parses the JSON body and we read its + idempotency_key field. Guards against missing data map (e.g. an + in-flight create) by defaulting through `dict`. */ -}} +{{- if and $existing (hasKey ($existing.data | default dict) "body.json") -}} +{{- (fromJson (index $existing.data "body.json")).idempotency_key -}} +{{- else -}} {{ printf "%s-%s" .Release.Name (now | unixEpoch) }} {{- end -}} {{- end -}} +{{- end -}} {{- define "ingestor.labels" -}} app.kubernetes.io/name: ingestor