diff --git a/client/templates/image-refresh-cronjob.yaml b/client/templates/image-refresh-cronjob.yaml
index ee92ed5..af4e2e3 100644
--- a/client/templates/image-refresh-cronjob.yaml
+++ b/client/templates/image-refresh-cronjob.yaml
@@ -83,6 +83,12 @@ data:
     log "  client images: tracebloc/{jobs-manager,pods-monitor} on docker.io under tag=$IMAGE_TAG"
     log "  ingestor image: tracebloc/ingestor on ghcr.io under tag=$INGESTOR_TAG (pinned=$INGESTOR_PINNED)"
 
+    # Consecutive ghcr ingestor-digest resolve failures before WARN+skip
+    # escalates to a failed Job (#186 #2). Injected by the chart; default to 3
+    # when unset (hand-edited CronJob under -u) or not a positive integer.
+    : "${INGESTOR_RESOLVE_FAIL_THRESHOLD:=3}"
+    case "$INGESTOR_RESOLVE_FAIL_THRESHOLD" in *[!0-9]*|0) INGESTOR_RESOLVE_FAIL_THRESHOLD=3 ;; esac
+
     # Get an anonymous pull-scope token for one repository on a
     # registry. Both docker.io and ghcr.io support anonymous tokens for
     # public images; only the issuer URL differs. tr cleanup handles
@@ -284,9 +290,50 @@ data:
       log "  ingestor.autoRefresh: false in values; skipping (operator opted into pinning)"
     else
       latest_ingestor="$(get_latest_digest "tracebloc/ingestor" "$INGESTOR_TAG" "ghcr.io" || true)"
+      ingestor_fail_key="tracebloc.io/ingestor-refresh-consecutive-failures"
+      ingestor_err_key="tracebloc.io/ingestor-refresh-last-error"
       if [ -z "$latest_ingestor" ]; then
-        log "  WARN: could not resolve latest digest (rate-limited or transient); skipping this tick"
+        # Could not resolve the ingestor digest from ghcr.io. A transient
+        # blip (rate-limit, momentary DNS) and a PERSISTENT failure (egress
+        # to ghcr.io firewalled, the ghcr token endpoint blocked, a proxy
+        # that allowlists docker.io but not ghcr.io) are indistinguishable
+        # on a single tick — so COUNT consecutive failures and escalate
+        # once they cross the threshold, instead of skipping silently
+        # forever. #186 (#2): berlin-team sat on the amd64-only baseline
+        # because every ingestor tick hit this branch while the docker.io
+        # images (jobs-manager, pods-monitor) refreshed fine — the CronJob
+        # looked healthy and nothing surfaced the ghcr failure.
+        prev_fails="$(get_annotation "$ingestor_fail_key" || true)"
+        case "$prev_fails" in ''|*[!0-9]*) prev_fails=0 ;; esac
+        if [ "$prev_fails" -ge "$INGESTOR_RESOLVE_FAIL_THRESHOLD" ]; then
+          # Already escalated on an earlier tick; stay loud (fail the Job)
+          # without inflating the counter — it caps at the threshold.
+          log "  ERROR: still cannot resolve ghcr.io/tracebloc/ingestor:${INGESTOR_TAG} (>= ${INGESTOR_RESOLVE_FAIL_THRESHOLD} consecutive failures). Ingestor digest is NOT auto-refreshing; ingestion may be stuck on a stale image. Check egress to ghcr.io and its token endpoint from this namespace. See client#186."
+          exit 1
+        fi
+        fails=$((prev_fails + 1))
+        kubectl annotate deployment -n "$RELEASE_NAMESPACE" "$DEPLOYMENT_NAME" \
+          "${ingestor_fail_key}=${fails}" --overwrite
+        if [ "$fails" -ge "$INGESTOR_RESOLVE_FAIL_THRESHOLD" ]; then
+          # Persistent. Record a human-readable last-error and fail the Job
+          # so it surfaces in `kubectl get cronjob` and monitoring — the
+          # same "failed Job = operator-visible" idiom Pass 2's stuck-
+          # rollout check already relies on.
+          kubectl annotate deployment -n "$RELEASE_NAMESPACE" "$DEPLOYMENT_NAME" \
+            "${ingestor_err_key}=could not resolve ghcr.io/tracebloc/ingestor:${INGESTOR_TAG} for ${fails} consecutive ticks" --overwrite
+          log "  ERROR: could not resolve ghcr.io/tracebloc/ingestor:${INGESTOR_TAG} for ${fails} consecutive ticks (threshold ${INGESTOR_RESOLVE_FAIL_THRESHOLD}). Ingestor digest is NOT auto-refreshing; ingestion may be stuck on a stale image (e.g. an amd64-only baseline on arm64 nodes). Check egress to ghcr.io and its token endpoint from this namespace. Failing the Job so this surfaces in monitoring. See client#186."
+          exit 1
+        fi
+        log "  WARN: could not resolve latest ingestor digest (failure ${fails}/${INGESTOR_RESOLVE_FAIL_THRESHOLD}; transient?); skipping this tick"
       else
+        # Resolved OK. Clear any prior failure streak so a recovered
+        # registry/egress blip doesn't leave a stale failure annotation
+        # (and a future failure starts counting from zero again).
+        if [ -n "$(get_annotation "$ingestor_fail_key" || true)" ]; then
+          log "  ingestor digest resolved; clearing prior failure streak"
+          kubectl annotate deployment -n "$RELEASE_NAMESPACE" "$DEPLOYMENT_NAME" \
+            "${ingestor_fail_key}-" "${ingestor_err_key}-" >/dev/null 2>&1 || true
+        fi
         recorded_ingestor="$(get_annotation "tracebloc.io/last-refreshed-ingestor-digest" || true)"
         # Always read spec env too — the annotation alone isn't enough
         # because external actors can revert the spec without touching
@@ -484,6 +531,13 @@ spec:
                 # semver-style float tags. Caught in PR #162 review.
                 - name: INGESTOR_TAG
                   value: {{ (default dict (default dict .Values.images).ingestor).tag | default "0.3" | quote }}
+                # Consecutive failed ghcr ingestor-digest resolves before
+                # image-refresh stops silently skipping and fails the Job
+                # (#186 #2). `default 3` keeps --reuse-values upgrades whose
+                # stored values lack the key rendering; `int` keeps a large
+                # value (decoded as a float) from rendering as "1e+06".
+                - name: INGESTOR_RESOLVE_FAIL_THRESHOLD
+                  value: {{ (default dict .Values.imageRefresh).ingestorResolveFailureThreshold | default 3 | int | quote }}
               securityContext:
                 allowPrivilegeEscalation: false
                 readOnlyRootFilesystem: true
diff --git a/client/tests/image_refresh_test.yaml b/client/tests/image_refresh_test.yaml
index a13c665..42f458d 100644
--- a/client/tests/image_refresh_test.yaml
+++ b/client/tests/image_refresh_test.yaml
@@ -463,6 +463,56 @@ tests:
             name: INGESTOR_TAG
             value: staging
 
+  - it: ingestor INGESTOR_RESOLVE_FAIL_THRESHOLD defaults to "3" (#186 #2)
+    # Below this many consecutive ghcr resolve failures, image-refresh
+    # WARNs + skips; at/above it the script fails the Job loudly instead
+    # of silently leaving the ingestor image on a stale digest.
+    template: templates/image-refresh-cronjob.yaml
+    documentIndex: 1
+    asserts:
+      - contains:
+          path: spec.jobTemplate.spec.template.spec.containers[0].env
+          content:
+            name: INGESTOR_RESOLVE_FAIL_THRESHOLD
+            value: "3"
+
+  - it: ingestor INGESTOR_RESOLVE_FAIL_THRESHOLD falls back to "3" when imageRefresh key is absent
+    # --reuse-values upgrade from a pre-this-PR stored manifest lacks the
+    # key; the template's `| default 3` must still render a usable value.
+    template: templates/image-refresh-cronjob.yaml
+    documentIndex: 1
+    set:
+      imageRefresh:
+        ingestorResolveFailureThreshold: null
+    asserts:
+      - contains:
+          path: spec.jobTemplate.spec.template.spec.containers[0].env
+          content:
+            name: INGESTOR_RESOLVE_FAIL_THRESHOLD
+            value: "3"
+
+  - it: ingestor INGESTOR_RESOLVE_FAIL_THRESHOLD is overridable
+    template: templates/image-refresh-cronjob.yaml
+    documentIndex: 1
+    set:
+      imageRefresh:
+        ingestorResolveFailureThreshold: 5
+    asserts:
+      - contains:
+          path: spec.jobTemplate.spec.template.spec.containers[0].env
+          content:
+            name: INGESTOR_RESOLVE_FAIL_THRESHOLD
+            value: "5"
+
+  - it: schema rejects ingestorResolveFailureThreshold below 1
+    template: templates/image-refresh-cronjob.yaml
+    set:
+      imageRefresh:
+        ingestorResolveFailureThreshold: 0
+    asserts:
+      - failedTemplate:
+          errorPattern: "must not be valid against schema|Must be greater than or equal to 1"
+
   - it: schema rejects ingestor.tag=latest
     template: templates/image-refresh-cronjob.yaml
     set:
diff --git a/client/values.schema.json b/client/values.schema.json
index 937b363..9ba0564 100644
--- a/client/values.schema.json
+++ b/client/values.schema.json
@@ -585,6 +585,12 @@
           "pattern": "^[0-9]+(s|m|h)$",
           "description": "Passed to `kubectl rollout status --timeout`. Allow headroom for bare-metal image pull."
         },
+        "ingestorResolveFailureThreshold": {
+          "type": "integer",
+          "minimum": 1,
+          "default": 3,
+          "description": "Consecutive failed ghcr.io ingestor-digest resolves before image-refresh fails the Job loudly instead of silently skipping. See #186 (#2). Higher tolerates flakier egress; very high keeps the old skip-forever behaviour."
+        },
         "suspend": {
           "type": "boolean",
           "default": false,
diff --git a/client/values.yaml b/client/values.yaml
index 4cf25c8..5e5eda9 100644
--- a/client/values.yaml
+++ b/client/values.yaml
@@ -489,6 +489,17 @@ imageRefresh:
   # on bare-metal first-boot can take minutes, and the CronJob slot
   # shouldn't hold all night if a rollout genuinely wedges.
   rolloutTimeout: "10m"
+  # Consecutive failed ghcr.io ingestor-digest resolves before image-refresh
+  # stops silently skipping and fails the Job loudly (visible in
+  # `kubectl get cronjob` / monitoring, plus a `tracebloc.io/ingestor-refresh-
+  # last-error` annotation on the jobs-manager deployment). Default 3 ≈ 45 min
+  # at the 15-min schedule: long enough to ride out a transient registry blip,
+  # short enough to surface a persistent egress/proxy failure (e.g. a cluster
+  # that reaches docker.io but not ghcr.io — the #186 (#2) root cause). A
+  # failed resolve BELOW this count still just WARNs and skips the tick.
+  # Raise it for flakier egress; set it very high to keep the old
+  # skip-forever behaviour. Only affects the ingestor (ghcr) image.
+  ingestorResolveFailureThreshold: 3
   # CronJob spec knobs. Override per-cluster if needed.
   suspend: false
   # Lower history limits than autoUpgrade — this Job runs ~96x/day, the