diff --git a/Dockerfile b/Dockerfile
index 88f6248..44e265f 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -219,6 +219,17 @@ RUN groupadd -g 1000 ${AGENT} \
     && mkdir -p /home/${AGENT}/.config /home/${AGENT}/workspace \
     && chown -R ${AGENT}:${AGENT} /home/${AGENT}
 
+# Per-agent default config baseline, staged into /etc/${AGENT}-defaults/
+# at build time. The entrypoint copies it into the agent config dir
+# BEFORE the ConfigMap overlay, so a deployment without a ConfigMap
+# still gets a runtime config. The overlay is a file-level copy: a
+# ConfigMap file with the same name replaces the default file whole.
+RUN mkdir -p "/etc/${AGENT}-defaults"
+COPY defaults/ /etc/defaults-staging/
+RUN if [ -f "/etc/defaults-staging/${AGENT}-config.toml" ]; then \
+        cp "/etc/defaults-staging/${AGENT}-config.toml" "/etc/${AGENT}-defaults/config.toml"; \
+    fi && rm -rf /etc/defaults-staging
+
 # Entrypoint + bash profile.
 COPY --chmod=0755 bin/entrypoint.sh /usr/local/bin/entrypoint.sh
 COPY --chown=${AGENT}:${AGENT} profile/.bashrc /home/${AGENT}/.bashrc
diff --git a/bin/entrypoint.sh b/bin/entrypoint.sh
index eef43ac..42f165d 100644
--- a/bin/entrypoint.sh
+++ b/bin/entrypoint.sh
@@ -97,14 +97,56 @@ claude)
   ;;
 esac
 
-# Sync managed config from a ConfigMap mounted at /etc/-config/.
+# Layer 1 — image-baked defaults at /etc/${AGENT}-defaults/.
+# Gives a deployment without a ConfigMap a working runtime config.
+# The ConfigMap overlay below is a file-level copy: any file it also
+# ships replaces the default whole (no per-key merge). Today this is
+# the Codex sandbox/approval baseline (defaults/codex-config.toml; OPS-405).
+#
+# cp -afL: -a recurses + preserves attributes, -L dereferences symlinks.
+# Failures exit FATAL rather than being masked — same pattern as the
+# ConfigMap overlay below (OPS-406, codex-shell#10).
+AGENT_DEFAULTS_DIR="/etc/${AGENT}-defaults"
+if [ -d "${AGENT_DEFAULTS_DIR}" ]; then
+  if ! cp -afL "${AGENT_DEFAULTS_DIR}/." "${AGENT_CONFIG_DIR}/"; then
+    echo "FATAL: failed to sync image defaults from ${AGENT_DEFAULTS_DIR} to ${AGENT_CONFIG_DIR}" >&2
+    exit 1
+  fi
+  chmod -R u+w "${AGENT_CONFIG_DIR}" 2>/dev/null || true
+
+  # Smoke check: if the defaults dir has any entries, at least one
+  # entry must exist in the destination afterwards. Guards against a
+  # copy that reported success yet left the config dir empty.
+  if [ -n "$(find "${AGENT_DEFAULTS_DIR}" -mindepth 1 -print -quit 2>/dev/null)" ] \
+    && [ -z "$(find "${AGENT_CONFIG_DIR}" -mindepth 1 -print -quit 2>/dev/null)" ]; then
+    echo "FATAL: defaults sync ran but ${AGENT_CONFIG_DIR} is empty" >&2
+    exit 1
+  fi
+fi
+
+# Layer 2 — managed config from the ConfigMap at ${AGENT_CONFIG_SOURCE}.
 # The ConfigMap (apk8s repo) is the source of truth for model/MCP config;
 # in-pod edits get blown away on restart. Stakater Reloader restarts the
-# pod when the ConfigMap changes.
+# pod when the ConfigMap changes. Per-deployment overrides go here.
+#
+# cp -afL: -a recurses + preserves attributes, -L dereferences the
+# symlink farm that ConfigMap mounts use. The previous `cp -fL` skipped
+# subdirectories entirely and silently dropped managed config (OPS-406).
 if [ -d "${AGENT_CONFIG_SOURCE}" ]; then
-  # cp -L follows symlinks (configmap mounts are symlink farms).
-  cp -fL "${AGENT_CONFIG_SOURCE}/." "${AGENT_CONFIG_DIR}/" 2>/dev/null || true
+  if ! cp -afL "${AGENT_CONFIG_SOURCE}/." "${AGENT_CONFIG_DIR}/"; then
+    echo "FATAL: failed to sync managed config from ${AGENT_CONFIG_SOURCE} to ${AGENT_CONFIG_DIR}" >&2
+    exit 1
+  fi
   chmod -R u+w "${AGENT_CONFIG_DIR}" 2>/dev/null || true
+
+  # Smoke check: a non-empty ConfigMap mount must leave a non-empty
+  # destination. NOTE(review): once Layer 1 has populated the same
+  # destination this test cannot fail — consider checking a known file.
+  if [ -n "$(find "${AGENT_CONFIG_SOURCE}" -mindepth 1 -print -quit 2>/dev/null)" ] \
+    && [ -z "$(find "${AGENT_CONFIG_DIR}" -mindepth 1 -print -quit 2>/dev/null)" ]; then
+    echo "FATAL: managed config sync ran but ${AGENT_CONFIG_DIR} is empty" >&2
+    exit 1
+  fi
 fi
 
 # Pull nprodromou/agent-config for the canonical Nate-org instructions
diff --git a/defaults/codex-config.toml b/defaults/codex-config.toml
new file mode 100644
index 0000000..c9c03e1
--- /dev/null
+++ b/defaults/codex-config.toml
@@ -0,0 +1,37 @@
+# Default Codex CLI runtime config for the codex-shell pod.
+#
+# Layered with /etc/codex-config (apk8s ConfigMap) at entrypoint time —
+# image defaults are copied first, then the ConfigMap files are copied
+# over them. The copy is per file: a ConfigMap file named config.toml
+# replaces this baseline whole, so it must restate keys it wants kept.
+#
+# Why these values:
+#
+# - `sandbox_mode = "danger-full-access"`: the pod itself is the
+#   security boundary (non-root user, restricted RBAC, PVC isolation).
+#   Codex's internal bubblewrap layer is redundant in this deployment
+#   and was failing on `bwrap: No permissions to create new namespace`
+#   in apk8s pods that don't allow unprivileged user namespaces (most
+#   hardened k8s clusters). Disabling the inner sandbox means commands
+#   no longer escalate-on-bwrap-failure for every read.
+#
+# - `approval_policy = "on-failure"`: with the inner sandbox off, no
+#   sandbox-failure escalations happen. The user only gets prompted
+#   when a command genuinely fails. Combined with full-access this is
+#   functionally "no per-command prompts" — appropriate for a trusted
+#   agent pod, not for an unrestricted user shell.
+#
+# - The `[projects."/home/codex/workspace"]` trust entry mirrors what
+#   the live config already had (per OPS-405 description) and makes
+#   the trust explicit at image-default level.
+#
+# To tighten later (e.g., re-enable inner sandbox once unprivileged
+# user-namespace-clone is enabled at the kubelet/sysctl level), set
+# `sandbox_mode = "workspace-write"` in the apk8s ConfigMap; this
+# baseline doesn't need to change.
+
+sandbox_mode = "danger-full-access"
+approval_policy = "on-failure"
+
+[projects."/home/codex/workspace"]
+trust_level = "trusted"