Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 55 additions & 1 deletion client/templates/image-refresh-cronjob.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,12 @@ data:
log " client images: tracebloc/{jobs-manager,pods-monitor} on docker.io under tag=$IMAGE_TAG"
log " ingestor image: tracebloc/ingestor on ghcr.io under tag=$INGESTOR_TAG (pinned=$INGESTOR_PINNED)"

# Consecutive failed ghcr ingestor-digest resolves before image-refresh
# escalates from WARN+skip to a failed Job (#186, item 2). The chart injects
# this from imageRefresh.ingestorResolveFailureThreshold; it is defaulted
# here too so a hand-edited CronJob missing the env var still runs under -u.
: "${INGESTOR_RESOLVE_FAIL_THRESHOLD:=3}"
# The escalation logic compares against this value with `[ N -ge ... ]`.
# A non-integer value (a hand edit, or a number rendered in exponent form
# such as "1e+06") makes `[` error out, which an `if` treats as false —
# escalation would then never fire and the resolve failure would be skipped
# forever, which is exactly the silent mode this threshold exists to end.
# Fall back to the default and say so.
case "$INGESTOR_RESOLVE_FAIL_THRESHOLD" in
  *[!0-9]*)
    log "  WARN: INGESTOR_RESOLVE_FAIL_THRESHOLD='${INGESTOR_RESOLVE_FAIL_THRESHOLD}' is not a non-negative integer; falling back to 3"
    INGESTOR_RESOLVE_FAIL_THRESHOLD=3
    ;;
esac

# Get an anonymous pull-scope token for one repository on a
# registry. Both docker.io and ghcr.io support anonymous tokens for
# public images; only the issuer URL differs. tr cleanup handles
Expand Down Expand Up @@ -284,9 +290,50 @@ data:
log " ingestor.autoRefresh: false in values; skipping (operator opted into pinning)"
else
latest_ingestor="$(get_latest_digest "tracebloc/ingestor" "$INGESTOR_TAG" "ghcr.io" || true)"
ingestor_fail_key="tracebloc.io/ingestor-refresh-consecutive-failures"
ingestor_err_key="tracebloc.io/ingestor-refresh-last-error"
if [ -z "$latest_ingestor" ]; then
log " WARN: could not resolve latest digest (rate-limited or transient); skipping this tick"
# Could not resolve the ingestor digest from ghcr.io. A transient
# blip (rate-limit, momentary DNS) and a PERSISTENT failure (egress
# to ghcr.io firewalled, the ghcr token endpoint blocked, a proxy
# that allowlists docker.io but not ghcr.io) are indistinguishable
# on a single tick — so COUNT consecutive failures and escalate
# once they cross the threshold, instead of skipping silently
# forever. #186 (#2): berlin-team sat on the amd64-only baseline
# because every ingestor tick hit this branch while the docker.io
# images (jobs-manager, pods-monitor) refreshed fine — the CronJob
# looked healthy and nothing surfaced the ghcr failure.
prev_fails="$(get_annotation "$ingestor_fail_key" || true)"
case "$prev_fails" in ''|*[!0-9]*) prev_fails=0 ;; esac
if [ "$prev_fails" -ge "$INGESTOR_RESOLVE_FAIL_THRESHOLD" ]; then
# Already escalated on an earlier tick; stay loud (fail the Job)
# without inflating the counter — it caps at the threshold.
log " ERROR: still cannot resolve ghcr.io/tracebloc/ingestor:${INGESTOR_TAG} (>= ${INGESTOR_RESOLVE_FAIL_THRESHOLD} consecutive failures). Ingestor digest is NOT auto-refreshing; ingestion may be stuck on a stale image. Check egress to ghcr.io and its token endpoint from this namespace. See client#186."
exit 1
fi
fails=$((prev_fails + 1))
kubectl annotate deployment -n "$RELEASE_NAMESPACE" "$DEPLOYMENT_NAME" \
"${ingestor_fail_key}=${fails}" --overwrite
if [ "$fails" -ge "$INGESTOR_RESOLVE_FAIL_THRESHOLD" ]; then
# Persistent. Record a human-readable last-error and fail the Job
# so it surfaces in `kubectl get cronjob` and monitoring — the
# same "failed Job = operator-visible" idiom Pass 2's stuck-
# rollout check already relies on.
kubectl annotate deployment -n "$RELEASE_NAMESPACE" "$DEPLOYMENT_NAME" \
"${ingestor_err_key}=could not resolve ghcr.io/tracebloc/ingestor:${INGESTOR_TAG} for ${fails} consecutive ticks" --overwrite
log " ERROR: could not resolve ghcr.io/tracebloc/ingestor:${INGESTOR_TAG} for ${fails} consecutive ticks (threshold ${INGESTOR_RESOLVE_FAIL_THRESHOLD}). Ingestor digest is NOT auto-refreshing; ingestion may be stuck on a stale image (e.g. an amd64-only baseline on arm64 nodes). Check egress to ghcr.io and its token endpoint from this namespace. Failing the Job so this surfaces in monitoring. See client#186."
exit 1
fi
log " WARN: could not resolve latest ingestor digest (failure ${fails}/${INGESTOR_RESOLVE_FAIL_THRESHOLD}; transient?); skipping this tick"
else
# Resolved OK. Clear any prior failure streak so a recovered
# registry/egress blip doesn't leave a stale failure annotation
# (and a future failure starts counting from zero again).
if [ -n "$(get_annotation "$ingestor_fail_key" || true)" ]; then
log " ingestor digest resolved; clearing prior failure streak"
kubectl annotate deployment -n "$RELEASE_NAMESPACE" "$DEPLOYMENT_NAME" \
"${ingestor_fail_key}-" "${ingestor_err_key}-" >/dev/null 2>&1 || true
fi
recorded_ingestor="$(get_annotation "tracebloc.io/last-refreshed-ingestor-digest" || true)"
# Always read spec env too — the annotation alone isn't enough
# because external actors can revert the spec without touching
Expand Down Expand Up @@ -484,6 +531,13 @@ spec:
# semver-style float tags. Caught in PR #162 review.
- name: INGESTOR_TAG
value: {{ (default dict (default dict .Values.images).ingestor).tag | default "0.3" | quote }}
# Consecutive failed ghcr ingestor-digest resolves before
# image-refresh stops silently skipping and fails the Job
# loudly (#186, item 2). nil-guarded with a `default 3` fallback
# so --reuse-values upgrades from pre-this-PR stored
# manifests (which lack the key) still render.
# `int64` forces integer formatting before quoting: a number read
# from a values file is a float, and a large one would otherwise
# be rendered in exponent form (e.g. "1e+06"), which the script's
# integer comparison cannot parse.
- name: INGESTOR_RESOLVE_FAIL_THRESHOLD
  value: {{ (default dict .Values.imageRefresh).ingestorResolveFailureThreshold | default 3 | int64 | quote }}
securityContext:
allowPrivilegeEscalation: false
readOnlyRootFilesystem: true
Expand Down
50 changes: 50 additions & 0 deletions client/tests/image_refresh_test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -463,6 +463,56 @@ tests:
name: INGESTOR_TAG
value: staging

# The title is quoted on purpose: in an unquoted scalar, " #2)" starts a YAML
# comment and the test name would be silently truncated to "... (#186".
- it: 'ingestor INGESTOR_RESOLVE_FAIL_THRESHOLD defaults to "3" (#186 #2)'
  # Below this many consecutive ghcr resolve failures, image-refresh
  # WARNs + skips; at/above it the script fails the Job loudly instead
  # of silently leaving the ingestor image on a stale digest.
  template: templates/image-refresh-cronjob.yaml
  documentIndex: 1
  asserts:
    - contains:
        path: spec.jobTemplate.spec.template.spec.containers[0].env
        content:
          name: INGESTOR_RESOLVE_FAIL_THRESHOLD
          value: "3"

- it: 'ingestor INGESTOR_RESOLVE_FAIL_THRESHOLD falls back to "3" when imageRefresh key is absent'
  # Values stored before this key existed (a --reuse-values upgrade) do not
  # carry it. The threshold is nulled here to stand in for that case; the
  # template's `| default 3` must still produce a usable env value.
  template: templates/image-refresh-cronjob.yaml
  documentIndex: 1
  set:
    imageRefresh:
      ingestorResolveFailureThreshold: null
  asserts:
    - contains:
        path: spec.jobTemplate.spec.template.spec.containers[0].env
        content: {name: INGESTOR_RESOLVE_FAIL_THRESHOLD, value: '3'}

- it: 'ingestor INGESTOR_RESOLVE_FAIL_THRESHOLD is overridable'
  # An explicit imageRefresh.ingestorResolveFailureThreshold must reach the
  # container env as a quoted string.
  template: templates/image-refresh-cronjob.yaml
  documentIndex: 1
  set:
    imageRefresh:
      ingestorResolveFailureThreshold: 5
  asserts:
    - contains:
        path: spec.jobTemplate.spec.template.spec.containers[0].env
        content: {name: INGESTOR_RESOLVE_FAIL_THRESHOLD, value: '5'}

- it: 'schema rejects ingestorResolveFailureThreshold below 1'
  # values.schema.json declares `minimum: 1` for this key, so a value of 0
  # must fail validation instead of rendering the template.
  template: templates/image-refresh-cronjob.yaml
  set:
    imageRefresh:
      ingestorResolveFailureThreshold: 0
  asserts:
    - failedTemplate:
        errorPattern: 'must not be valid against schema|Must be greater than or equal to 1'

- it: schema rejects ingestor.tag=latest
template: templates/image-refresh-cronjob.yaml
set:
Expand Down
6 changes: 6 additions & 0 deletions client/values.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -585,6 +585,12 @@
"pattern": "^[0-9]+(s|m|h)$",
"description": "Passed to `kubectl rollout status --timeout`. Allow headroom for bare-metal image pull."
},
"ingestorResolveFailureThreshold": {
"type": "integer",
"minimum": 1,
"default": 3,
"description": "Consecutive failed ghcr.io ingestor-digest resolves before image-refresh fails the Job loudly instead of silently skipping. See #186 (#2). Higher tolerates flakier egress; very high keeps the old skip-forever behaviour."
},
"suspend": {
"type": "boolean",
"default": false,
Expand Down
11 changes: 11 additions & 0 deletions client/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -489,6 +489,17 @@ imageRefresh:
# on bare-metal first-boot can take minutes, and the CronJob slot
# shouldn't hold all night if a rollout genuinely wedges.
rolloutTimeout: "10m"
# Consecutive failed ghcr.io ingestor-digest resolves before image-refresh
# stops silently skipping and fails the Job loudly (visible in
# `kubectl get cronjob` / monitoring, plus a
# `tracebloc.io/ingestor-refresh-last-error` annotation on the jobs-manager
# deployment). Default 3 ≈ 45 min at the 15-min schedule: long enough to
# ride out a transient registry blip, short enough to surface a persistent
# egress/proxy failure (e.g. a cluster that reaches docker.io but not
# ghcr.io — the #186 (#2) root cause). A failed resolve BELOW this count
# still just WARNs and skips the tick. Raise it for flakier egress; set it
# very high to keep the old skip-forever behaviour. Only affects the
# ingestor (ghcr) image.
ingestorResolveFailureThreshold: 3
# CronJob spec knobs. Override per-cluster if needed.
suspend: false
# Lower history limits than autoUpgrade — this Job runs ~96x/day, the
Expand Down
Loading