diff --git a/client/templates/image-refresh-cronjob.yaml b/client/templates/image-refresh-cronjob.yaml index ee92ed5..af4e2e3 100644 --- a/client/templates/image-refresh-cronjob.yaml +++ b/client/templates/image-refresh-cronjob.yaml @@ -83,6 +83,12 @@ data: log " client images: tracebloc/{jobs-manager,pods-monitor} on docker.io under tag=$IMAGE_TAG" log " ingestor image: tracebloc/ingestor on ghcr.io under tag=$INGESTOR_TAG (pinned=$INGESTOR_PINNED)" + # Consecutive failed ghcr ingestor-digest resolves before image-refresh + # escalates from WARN+skip to a failed Job (#186 #2). The chart injects + # this from imageRefresh.ingestorResolveFailureThreshold; default here + # too so a hand-edited CronJob missing the env var still runs under -u. + : "${INGESTOR_RESOLVE_FAIL_THRESHOLD:=3}" + # Get an anonymous pull-scope token for one repository on a # registry. Both docker.io and ghcr.io support anonymous tokens for # public images; only the issuer URL differs. tr cleanup handles @@ -284,9 +290,50 @@ data: log " ingestor.autoRefresh: false in values; skipping (operator opted into pinning)" else latest_ingestor="$(get_latest_digest "tracebloc/ingestor" "$INGESTOR_TAG" "ghcr.io" || true)" + ingestor_fail_key="tracebloc.io/ingestor-refresh-consecutive-failures" + ingestor_err_key="tracebloc.io/ingestor-refresh-last-error" if [ -z "$latest_ingestor" ]; then - log " WARN: could not resolve latest digest (rate-limited or transient); skipping this tick" + # Could not resolve the ingestor digest from ghcr.io. A transient + # blip (rate-limit, momentary DNS) and a PERSISTENT failure (egress + # to ghcr.io firewalled, the ghcr token endpoint blocked, a proxy + # that allowlists docker.io but not ghcr.io) are indistinguishable + # on a single tick — so COUNT consecutive failures and escalate + # once they cross the threshold, instead of skipping silently + # forever. 
#186 (#2): berlin-team sat on the amd64-only baseline + # because every ingestor tick hit this branch while the docker.io + # images (jobs-manager, pods-monitor) refreshed fine — the CronJob + # looked healthy and nothing surfaced the ghcr failure. + prev_fails="$(get_annotation "$ingestor_fail_key" || true)" + case "$prev_fails" in ''|*[!0-9]*) prev_fails=0 ;; esac + if [ "$prev_fails" -ge "$INGESTOR_RESOLVE_FAIL_THRESHOLD" ]; then + # Already escalated on an earlier tick; stay loud (fail the Job) + # without inflating the counter — it caps at the threshold. + log " ERROR: still cannot resolve ghcr.io/tracebloc/ingestor:${INGESTOR_TAG} (>= ${INGESTOR_RESOLVE_FAIL_THRESHOLD} consecutive failures). Ingestor digest is NOT auto-refreshing; ingestion may be stuck on a stale image. Check egress to ghcr.io and its token endpoint from this namespace. See client#186." + exit 1 + fi + fails=$((prev_fails + 1)) + kubectl annotate deployment -n "$RELEASE_NAMESPACE" "$DEPLOYMENT_NAME" \ + "${ingestor_fail_key}=${fails}" --overwrite + if [ "$fails" -ge "$INGESTOR_RESOLVE_FAIL_THRESHOLD" ]; then + # Persistent. Record a human-readable last-error and fail the Job + # so it surfaces in `kubectl get cronjob` and monitoring — the + # same "failed Job = operator-visible" idiom Pass 2's stuck- + # rollout check already relies on. + kubectl annotate deployment -n "$RELEASE_NAMESPACE" "$DEPLOYMENT_NAME" \ + "${ingestor_err_key}=could not resolve ghcr.io/tracebloc/ingestor:${INGESTOR_TAG} for ${fails} consecutive ticks" --overwrite + log " ERROR: could not resolve ghcr.io/tracebloc/ingestor:${INGESTOR_TAG} for ${fails} consecutive ticks (threshold ${INGESTOR_RESOLVE_FAIL_THRESHOLD}). Ingestor digest is NOT auto-refreshing; ingestion may be stuck on a stale image (e.g. an amd64-only baseline on arm64 nodes). Check egress to ghcr.io and its token endpoint from this namespace. Failing the Job so this surfaces in monitoring. See client#186." 
+ exit 1 + fi + log " WARN: could not resolve latest ingestor digest (failure ${fails}/${INGESTOR_RESOLVE_FAIL_THRESHOLD}; transient?); skipping this tick" else + # Resolved OK. Clear any prior failure streak so a recovered + # registry/egress blip doesn't leave a stale failure annotation + # (and a future failure starts counting from zero again). + if [ -n "$(get_annotation "$ingestor_fail_key" || true)" ]; then + log " ingestor digest resolved; clearing prior failure streak" + kubectl annotate deployment -n "$RELEASE_NAMESPACE" "$DEPLOYMENT_NAME" \ + "${ingestor_fail_key}-" "${ingestor_err_key}-" >/dev/null 2>&1 || true + fi recorded_ingestor="$(get_annotation "tracebloc.io/last-refreshed-ingestor-digest" || true)" # Always read spec env too — the annotation alone isn't enough # because external actors can revert the spec without touching @@ -484,6 +531,13 @@ spec: # semver-style float tags. Caught in PR #162 review. - name: INGESTOR_TAG value: {{ (default dict (default dict .Values.images).ingestor).tag | default "0.3" | quote }} + # Consecutive failed ghcr ingestor-digest resolves before + # image-refresh stops silently skipping and fails the Job + # loudly (#186 #2). nil-guarded with a `default 3` fallback + # so --reuse-values upgrades from pre-this-PR stored + # manifests (which lack the key) still render. 
+ - name: INGESTOR_RESOLVE_FAIL_THRESHOLD + value: {{ (default dict .Values.imageRefresh).ingestorResolveFailureThreshold | default 3 | quote }} securityContext: allowPrivilegeEscalation: false readOnlyRootFilesystem: true diff --git a/client/tests/image_refresh_test.yaml b/client/tests/image_refresh_test.yaml index a13c665..42f458d 100644 --- a/client/tests/image_refresh_test.yaml +++ b/client/tests/image_refresh_test.yaml @@ -463,6 +463,56 @@ tests: name: INGESTOR_TAG value: staging + - it: ingestor INGESTOR_RESOLVE_FAIL_THRESHOLD defaults to "3" (#186 #2) + # Below this many consecutive ghcr resolve failures, image-refresh + # WARNs + skips; at/above it the script fails the Job loudly instead + # of silently leaving jobs-manager on a stale ingestor digest. + template: templates/image-refresh-cronjob.yaml + documentIndex: 1 + asserts: + - contains: + path: spec.jobTemplate.spec.template.spec.containers[0].env + content: + name: INGESTOR_RESOLVE_FAIL_THRESHOLD + value: "3" + + - it: ingestor INGESTOR_RESOLVE_FAIL_THRESHOLD falls back to "3" when imageRefresh key is absent + # --reuse-values upgrade from a pre-this-PR stored manifest lacks the + # key; the template's `| default 3` must still render a usable value. 
+ template: templates/image-refresh-cronjob.yaml + documentIndex: 1 + set: + imageRefresh: + ingestorResolveFailureThreshold: null + asserts: + - contains: + path: spec.jobTemplate.spec.template.spec.containers[0].env + content: + name: INGESTOR_RESOLVE_FAIL_THRESHOLD + value: "3" + + - it: ingestor INGESTOR_RESOLVE_FAIL_THRESHOLD is overridable + template: templates/image-refresh-cronjob.yaml + documentIndex: 1 + set: + imageRefresh: + ingestorResolveFailureThreshold: 5 + asserts: + - contains: + path: spec.jobTemplate.spec.template.spec.containers[0].env + content: + name: INGESTOR_RESOLVE_FAIL_THRESHOLD + value: "5" + + - it: schema rejects ingestorResolveFailureThreshold below 1 + template: templates/image-refresh-cronjob.yaml + set: + imageRefresh: + ingestorResolveFailureThreshold: 0 + asserts: + - failedTemplate: + errorPattern: "must not be valid against schema|Must be greater than or equal to 1" + - it: schema rejects ingestor.tag=latest template: templates/image-refresh-cronjob.yaml set: diff --git a/client/values.schema.json b/client/values.schema.json index 937b363..9ba0564 100644 --- a/client/values.schema.json +++ b/client/values.schema.json @@ -585,6 +585,12 @@ "pattern": "^[0-9]+(s|m|h)$", "description": "Passed to `kubectl rollout status --timeout`. Allow headroom for bare-metal image pull." }, + "ingestorResolveFailureThreshold": { + "type": "integer", + "minimum": 1, + "default": 3, + "description": "Consecutive failed ghcr.io ingestor-digest resolves before image-refresh fails the Job loudly instead of silently skipping. See #186 (#2). Higher tolerates flakier egress; very high keeps the old skip-forever behaviour." 
+ }, "suspend": { "type": "boolean", "default": false, diff --git a/client/values.yaml b/client/values.yaml index 4cf25c8..5e5eda9 100644 --- a/client/values.yaml +++ b/client/values.yaml @@ -489,6 +489,17 @@ imageRefresh: # on bare-metal first-boot can take minutes, and the CronJob slot # shouldn't hold all night if a rollout genuinely wedges. rolloutTimeout: "10m" + # Consecutive failed ghcr.io ingestor-digest resolves before image-refresh + # stops silently skipping and fails the Job loudly (visible in + # `kubectl get cronjob` / monitoring, plus a `tracebloc.io/ingestor-refresh- + # last-error` annotation on the jobs-manager deployment). Default 3 ≈ 45 min + # at the 15-min schedule: long enough to ride out a transient registry blip, + # short enough to surface a persistent egress/proxy failure (e.g. a cluster + # that reaches docker.io but not ghcr.io — the #186 (#2) root cause). A + # failed resolve BELOW this count still just WARNs and skips the tick. + # Raise it for flakier egress; set it very high to keep the old + # skip-forever behaviour. Only affects the ingestor (ghcr) image. + ingestorResolveFailureThreshold: 3 # CronJob spec knobs. Override per-cluster if needed. suspend: false # Lower history limits than autoUpgrade — this Job runs ~96x/day, the