From e2577aa89d693bfe31ae0fdd5547521c697281cf Mon Sep 17 00:00:00 2001 From: Lukas Wuttke Date: Mon, 1 Jun 2026 21:26:01 +0200 Subject: [PATCH 01/13] feat(installer): --diagnose support bundle (redacted) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a one-command support bundle so a customer hitting an install/runtime problem can send a single file instead of a multi-round log-gathering email thread (the Charité thread is the archetype). `bash <(curl ... i.sh) --diagnose` (or `install-k8s.ps1 -Diagnose`) collects logs + cluster/host status into ~/.tracebloc/tracebloc-diagnose-.tgz. Two guarantees: - Best-effort: the whole collection runs under `set +e` and short-circuits before any install work, so it works even when the install is broken. - Credential-safe: clientPassword, proxy credentials (user:pass@host), and password=/token/secret values are REDACTED from every file before archiving; clientId is kept (it's the identifier support needs, not a secret). scripts/lib/diagnose.sh -- run_diagnose() (host/versions, docker/k3d, kubectl overview + describe of non-Running pods, workload logs with namespace auto-discovery, helm, install log + values.yaml, proxy env) + _redact_file(). install-k8s.sh sources it + adds the --diagnose short-circuit (clears the EXIT trap so the post-install message doesn't fire); install.sh adds it to the bootstrap download manifest. install-k8s.ps1 mirrors with -Diagnose / Invoke-DiagnoseBundle / Edit-Redaction. Documented in --help. Tests: diagnose.bats (7, incl. the end-to-end redaction gate) + Pester (+3). Verified on a Linux VM: the real --diagnose flag produced a 16-file bundle and the seeded dev password + proxy credentials had ZERO occurrences in the archive (clientId kept); also works with no cluster present. Stacked on #171 + #172 + #173 + #174. Co-Authored-By: Claude Opus 4.8 --- scripts/install-k8s.ps1 | 86 ++++++++++++++- scripts/install-k8s.sh | 5 + scripts/install.sh | 1 + scripts/lib/common.sh | 8 +- scripts/lib/diagnose.sh | 160 ++++++++++++++++++++++++++++ scripts/tests/diagnose.bats | 73 +++++++++++++ scripts/tests/install-k8s.Tests.ps1 | 43 ++++++++ 7 files changed, 374 insertions(+), 2 deletions(-) create mode 100644 scripts/lib/diagnose.sh create mode 100644 scripts/tests/diagnose.bats diff --git a/scripts/install-k8s.ps1 b/scripts/install-k8s.ps1 index e1d41dd..9d9347e 100644 --- a/scripts/install-k8s.ps1 +++ b/scripts/install-k8s.ps1 @@ -23,7 +23,7 @@ # ============================================================================= #Requires -Version 5.1 -param([switch]$Help, [switch]$NoReboot) +param([switch]$Help, [switch]$NoReboot, [switch]$Diagnose) # -- Admin check -------------------------------------------------------------- # $env:TB_PESTER lets the test suite dot-source this file to load the functions @@ -1428,6 +1428,89 @@ function Test-Preflight { } } +# ============================================================================= +# DIAGNOSE — `-Diagnose` support bundle (mirrors scripts/lib/diagnose.sh) +# ============================================================================= + +# Redact secrets from a file IN PLACE. Applied to every collected file before +# archiving. Single-quoted replacement strings keep $1 literal for the regex. +# Written UTF-8 without BOM. +function Edit-Redaction([string]$Path) { + if (-not (Test-Path $Path)) { return } + try { + $t = Get-Content -Path $Path -Raw -ErrorAction Stop + $t = $t -replace '(?i)(client[_-]?password\s*[:=]\s*).*', '$1[REDACTED]' + $t = $t -replace '([a-zA-Z][a-zA-Z0-9+.-]*://)[^:/@\s]+:[^@/\s]+@', '$1[REDACTED]@' + $t = $t -replace '(?i)(password\s*=\s*)[^\s&"]*', '$1[REDACTED]' + $t = $t -replace '(?i)((token|secret|authorization|api[_-]?key)\s*[:=]\s*).*', '$1[REDACTED]' + $utf8NoBom = New-Object System.Text.UTF8Encoding($false) + [System.IO.File]::WriteAllText($Path, $t, $utf8NoBom) + } catch {} +} + +function Invoke-DiagnoseBundle { + $ts = Get-Date -Format 'yyyyMMdd-HHmmss' + $base = if ($HOST_DATA_DIR) { $HOST_DATA_DIR } else { "$env:USERPROFILE\.tracebloc" } + $cn = if ($CLUSTER_NAME) { $CLUSTER_NAME } else { "tracebloc" } + New-Item -ItemType Directory -Path $base -Force -ErrorAction SilentlyContinue | Out-Null + $work = Join-Path ([System.IO.Path]::GetTempPath()) ("tracebloc-diag-" + [System.IO.Path]::GetRandomFileName()) + $d = Join-Path $work "tracebloc-diagnose-$ts" + New-Item -ItemType Directory -Path (Join-Path $d "logs") -Force | Out-Null + + Info "Collecting diagnostics -- this is safe; credentials are redacted before the file is written." + + # Namespace discovery (TB_NAMESPACE isn't set on a standalone diagnose run). + $ns = $TB_NAMESPACE + if (-not $ns) { + $jm = kubectl get pods -A 2>$null | Select-String '\-jobs-manager' | Select-Object -First 1 + if ($jm) { $ns = ($jm.ToString().Trim() -split '\s+')[0] } + } + if (-not $ns) { $ns = "default" } + + # host / versions + $h = @("# tracebloc diagnose ($ts)", "OS: Windows ARCH: $(Get-WindowsArch)", + "CLIENT_ENV: $($env:CLIENT_ENV) CLUSTER_NAME: $cn NAMESPACE: $ns", "## versions", + (k3d version 2>&1 | Out-String), (kubectl version --client 2>&1 | Out-String), + (helm version --short 2>&1 | Out-String), (docker version 2>&1 | Out-String)) + try { $cs = Get-CimInstance Win32_ComputerSystem -ErrorAction Stop; $h += "CPUs=$($cs.NumberOfLogicalProcessors) MemBytes=$($cs.TotalPhysicalMemory)" } catch {} + ($h -join "`n") | Out-File (Join-Path $d "00-host.txt") -Encoding utf8 + + ((docker ps -a --filter "name=k3d-$cn-" 2>&1 | Out-String) + "`n" + (k3d cluster list 2>&1 | Out-String)) | Out-File (Join-Path $d "01-docker.txt") -Encoding utf8 + + if (Get-Command kubectl -ErrorAction SilentlyContinue) { + (@("## nodes", (kubectl get nodes -o wide 2>&1 | Out-String), + "## pods", (kubectl get pods -A -o wide 2>&1 | Out-String), + "## events", (kubectl get events -A 2>&1 | Out-String)) -join "`n") | Out-File (Join-Path $d "02-kubectl.txt") -Encoding utf8 + foreach ($w in @("mysql-client", "$ns-jobs-manager", "$ns-requests-proxy")) { + kubectl logs -n $ns "deploy/$w" --all-containers --tail=500 2>&1 | Out-File (Join-Path $d "logs/$w.log") -Encoding utf8 + } + } + if (Get-Command helm -ErrorAction SilentlyContinue) { + (@("## helm list", (helm list -A 2>&1 | Out-String), "## values", (helm get values $ns -n $ns 2>&1 | Out-String)) -join "`n") | Out-File (Join-Path $d "04-helm.txt") -Encoding utf8 + } + + Get-ChildItem -Path $base -Filter "install-*.log" -ErrorAction SilentlyContinue | ForEach-Object { Copy-Item $_.FullName (Join-Path $d $_.Name) -ErrorAction SilentlyContinue } + if (Test-Path "$base\values.yaml") { Copy-Item "$base\values.yaml" (Join-Path $d "values.yaml") -ErrorAction SilentlyContinue } + + (("## proxy env`n") + ((@("HTTP_PROXY","HTTPS_PROXY","NO_PROXY") | ForEach-Object { "$_=" + [Environment]::GetEnvironmentVariable($_) }) -join "`n")) | Out-File (Join-Path $d "05-proxy.txt") -Encoding utf8 + + # REDACT every collected file, THEN archive. + Get-ChildItem -Path $d -Recurse -File | ForEach-Object { Edit-Redaction $_.FullName } + $bundle = Join-Path $base "tracebloc-diagnose-$ts.zip" + if (Test-Path $bundle) { Remove-Item $bundle -Force -ErrorAction SilentlyContinue } + Compress-Archive -Path $d -DestinationPath $bundle -Force -ErrorAction SilentlyContinue + Remove-Item $work -Recurse -Force -ErrorAction SilentlyContinue + + Write-Host "" + if (Test-Path $bundle) { + Ok "Diagnostics saved (credentials redacted):" + Write-Host " $bundle" + Hint "Send this file to tracebloc support -- it has logs + status with passwords removed." + } else { + Write-Host " Could not create the diagnostics archive." -ForegroundColor Red + } +} + # ============================================================================= # MAIN # ============================================================================= @@ -1435,6 +1518,7 @@ function Test-Preflight { if (-not $env:TB_PESTER) { if ($Help) { Print-Help } +if ($Diagnose) { Invoke-DiagnoseBundle; exit 0 } Confirm-Config Initialize-ToolDir diff --git a/scripts/install-k8s.sh b/scripts/install-k8s.sh index 5aa4692..615a701 100755 --- a/scripts/install-k8s.sh +++ b/scripts/install-k8s.sh @@ -50,12 +50,17 @@ source "${LIB_DIR}/cluster.sh" source "${LIB_DIR}/gpu-plugins.sh" source "${LIB_DIR}/install-client-helm.sh" source "${LIB_DIR}/summary.sh" +source "${LIB_DIR}/diagnose.sh" trap install_cleanup EXIT # ── Main ───────────────────────────────────────────────────────────────────── main() { [[ "${1:-}" == "--help" || "${1:-}" == "-h" ]] && print_help + # Support bundle: collect redacted diagnostics and exit, before any install + # work (so it works even when the install is broken). Clear the EXIT trap so + # the post-install cleanup message doesn't fire after a diagnose run. + [[ "${1:-}" == "--diagnose" ]] && { trap - EXIT; run_diagnose; exit $?; } validate_config setup_log_file diff --git a/scripts/install.sh b/scripts/install.sh index 29a5604..59be5a3 100755 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -43,6 +43,7 @@ FILES=( "scripts/lib/gpu-plugins.sh" "scripts/lib/install-client-helm.sh" "scripts/lib/summary.sh" + "scripts/lib/diagnose.sh" ) download_with_retry() { diff --git a/scripts/lib/common.sh b/scripts/lib/common.sh index 9e1a55c..6a5a979 100755 --- a/scripts/lib/common.sh +++ b/scripts/lib/common.sh @@ -351,7 +351,13 @@ tracebloc — client setup Usage: curl -fsSL https://raw.githubusercontent.com/tracebloc/client/main/scripts/install.sh | bash - ./install-k8s.sh [--help] + ./install-k8s.sh [--help] [--diagnose] + +Commands: + --diagnose Collect a redacted support bundle (logs + cluster/host status) + into ~/.tracebloc/tracebloc-diagnose-.tgz and exit. + Run this if something went wrong, then send the file to support + (passwords and proxy credentials are removed before it is written). Advanced configuration (environment variables): CLUSTER_NAME Cluster name (default: tracebloc) diff --git a/scripts/lib/diagnose.sh b/scripts/lib/diagnose.sh new file mode 100644 index 0000000..c732c70 --- /dev/null +++ b/scripts/lib/diagnose.sh @@ -0,0 +1,160 @@ +#!/usr/bin/env bash +# ============================================================================= +# diagnose.sh — `--diagnose` support bundle +# +# Collects logs + cluster/host status into ONE redacted archive the customer +# can send to support, collapsing multi-round triage into a single file. +# +# Two guarantees: +# • Best-effort — works even when the install is broken (the whole collection +# runs under `set +e`; every section is independent). +# • Credential-safe — clientPassword, proxy credentials (user:pass@host), and +# password=/token/secret values are redacted from EVERY file before it is +# archived. clientId is kept (it's the identifier support needs, not a secret). +# +# Side-effect-safe to source (function definitions only). +# ============================================================================= + +# Redact secrets from a file IN PLACE. Applied to every collected file before +# archiving. `sed -i.bak` + `rm .bak` is portable across GNU and BSD/macOS sed. +_redact_file() { + local f="$1" + [[ -f "$f" ]] || return 0 + sed -i.bak -E \ + -e 's/([Cc]lient[_-]?[Pp]assword[[:space:]]*[:=][[:space:]]*).*/\1[REDACTED]/' \ + -e 's#([a-zA-Z][a-zA-Z0-9+.-]*://)[^:/@[:space:]]+:[^@/[:space:]]+@#\1[REDACTED]@#g' \ + -e 's/([Pp]assword[[:space:]]*=[[:space:]]*)[^[:space:]&"]*/\1[REDACTED]/g' \ + -e 's/(([Tt]oken|[Ss]ecret|[Aa]uthorization|[Aa]pi[_-]?[Kk]ey)[[:space:]]*[:=][[:space:]]*).*/\1[REDACTED]/' \ + "$f" 2>/dev/null + rm -f "${f}.bak" 2>/dev/null +} + +# Redact every regular file under a directory. +_redact_tree() { + local f + while IFS= read -r f; do _redact_file "$f"; done < <(find "$1" -type f 2>/dev/null) +} + +run_diagnose() { + set +e # every step is best-effort — never abort the bundle mid-collection + + local ts base outdir d ns cn + ts="$(date +%Y%m%d-%H%M%S 2>/dev/null)"; [[ -z "$ts" ]] && ts="bundle" + base="${HOST_DATA_DIR:-$HOME/.tracebloc}"; mkdir -p "$base" 2>/dev/null + cn="${CLUSTER_NAME:-tracebloc}" + outdir="$(mktemp -d "${TMPDIR:-/tmp}/tracebloc-diag-XXXXXX" 2>/dev/null)" + if [[ -z "$outdir" || ! -d "$outdir" ]]; then echo " diagnose: cannot create a temp directory" >&2; return 1; fi + d="$outdir/tracebloc-diagnose-$ts"; mkdir -p "$d/logs" + + info "Collecting diagnostics — this is safe; credentials are redacted before the file is written." + + # Namespace discovery — TB_NAMESPACE isn't set on a standalone diagnose run, + # so find the namespace of the jobs-manager pod (falls back to "default"). + ns="${TB_NAMESPACE:-}" + if [[ -z "$ns" ]] && has kubectl; then + ns="$(kubectl get pods -A 2>/dev/null | awk '/-jobs-manager/{print $1; exit}')" + fi + [[ -z "$ns" ]] && ns="default" + + # ── host / versions ── + { + echo "# tracebloc diagnose ($ts)" + echo "OS: $(uname -s) $(uname -r)" + echo "ARCH: $(uname -m)" + echo "CLIENT_ENV: ${CLIENT_ENV:-} CLUSTER_NAME: $cn NAMESPACE: $ns" + echo; echo "## versions" + has k3d && k3d version + has kubectl && kubectl version --client 2>/dev/null + has helm && helm version --short 2>/dev/null + has docker && docker version 2>/dev/null + echo; echo "## cpu / mem / disk" + if [[ "$(uname -s)" == "Darwin" ]]; then + echo "ncpu=$(sysctl -n hw.ncpu 2>/dev/null) memsize=$(sysctl -n hw.memsize 2>/dev/null)" + else + echo "nproc=$(nproc 2>/dev/null)"; grep -i MemTotal /proc/meminfo 2>/dev/null + fi + df -h 2>/dev/null | head -20 + if has docker; then + echo; echo "## docker info" + docker info 2>/dev/null | grep -iE 'Server Version|Storage Driver|Docker Root|Operating System|Total Memory|CPUs|Cgroup' + fi + } > "$d/00-host.txt" 2>&1 + + # ── docker / k3d ── + { + echo "## docker ps -a (k3d nodes)" + has docker && docker ps -a --filter "name=k3d-${cn}-" --format 'table {{.Names}}\t{{.Status}}\t{{.Image}}' + echo; echo "## k3d cluster list" + has k3d && k3d cluster list + echo; echo "## node restart policy + proxy env" + if has docker; then + for c in $(docker ps -a --filter "name=k3d-${cn}-" --format '{{.Names}}' 2>/dev/null); do + echo "### $c" + docker inspect "$c" --format 'RestartPolicy={{.HostConfig.RestartPolicy.Name}}' 2>/dev/null + docker inspect "$c" --format '{{range .Config.Env}}{{println .}}{{end}}' 2>/dev/null | grep -iE 'PROXY' + done + fi + } > "$d/01-docker.txt" 2>&1 + + # ── kubectl overview + per-pod detail ── + if has kubectl; then + { + echo "## nodes"; kubectl get nodes -o wide 2>&1 + echo; echo "## pods (all namespaces)"; kubectl get pods -A -o wide 2>&1 + echo; echo "## workloads"; kubectl get deploy,ds,sts -A 2>&1 + echo; echo "## recent events"; kubectl get events -A --sort-by=.lastTimestamp 2>&1 | tail -120 + } > "$d/02-kubectl.txt" 2>&1 + { + echo "## describe of non-Running pods in namespace '$ns'" + for p in $(kubectl get pods -n "$ns" --no-headers 2>/dev/null | awk '$3!="Running" && $3!="Completed"{print $1}'); do + echo; echo "### $p"; kubectl describe pod -n "$ns" "$p" 2>&1 + done + } > "$d/03-describe.txt" 2>&1 + # workload logs (current + previous) + local w + for w in mysql-client "${ns}-jobs-manager" "${ns}-requests-proxy"; do + kubectl logs -n "$ns" "deploy/$w" --all-containers --tail=500 > "$d/logs/${w}.log" 2>&1 + kubectl logs -n "$ns" "deploy/$w" --all-containers --previous --tail=500 > "$d/logs/${w}.previous.log" 2>&1 + done + kubectl logs -n "$ns" "daemonset/tracebloc-resource-monitor" --tail=300 > "$d/logs/resource-monitor.log" 2>&1 + fi + + # ── helm (redacted afterwards) ── + if has helm; then + { + echo "## helm list -A"; helm list -A 2>&1 + echo; echo "## helm get values $ns"; helm get values "$ns" -n "$ns" 2>&1 + echo; echo "## helm get manifest $ns (truncated)"; helm get manifest "$ns" -n "$ns" 2>&1 | head -200 + } > "$d/04-helm.txt" 2>&1 + fi + + # ── install artifacts (copied, redacted afterwards) ── + cp "$base"/install-*.log "$d/" 2>/dev/null + [[ -f "$base/values.yaml" ]] && cp "$base/values.yaml" "$d/values.yaml" 2>/dev/null + + # ── proxy / env ── + { + echo "## proxy environment" + local v + for v in HTTP_PROXY HTTPS_PROXY NO_PROXY http_proxy https_proxy no_proxy; do + echo "$v=${!v:-}" + done + } > "$d/05-proxy.txt" 2>&1 + + # ── REDACT every collected file, THEN archive ── + _redact_tree "$d" + + local bundle="$base/tracebloc-diagnose-$ts.tgz" + tar -czf "$bundle" -C "$outdir" "tracebloc-diagnose-$ts" 2>/dev/null + rm -rf "$outdir" 2>/dev/null + + echo "" + if [[ -f "$bundle" ]]; then + success "Diagnostics saved (credentials redacted):" + echo " $bundle" + hint "Send this file to tracebloc support — it has logs + status with passwords removed." + return 0 + fi + echo " Could not create the diagnostics archive." >&2 + return 1 +} diff --git a/scripts/tests/diagnose.bats b/scripts/tests/diagnose.bats new file mode 100644 index 0000000..ef70ae8 --- /dev/null +++ b/scripts/tests/diagnose.bats @@ -0,0 +1,73 @@ +#!/usr/bin/env bats +# Tests for scripts/lib/diagnose.sh — the --diagnose support bundle. +# The redaction tests are the SECURITY GATE: a known secret must never survive +# into the bundle the customer sends to support. +load test_helper + +setup() { + load_lib diagnose.sh + HOST_DATA_DIR="$BATS_TEST_TMPDIR/tb" + CLUSTER_NAME=tracebloc + mkdir -p "$HOST_DATA_DIR" +} + +# ── _redact_file (security) ───────────────────────────────────────────────── +@test "_redact_file: clientPassword redacted, clientId kept" { + f="$BATS_TEST_TMPDIR/v.yaml" + printf 'clientId: "abc-123"\nclientPassword: '\''S3cr3tP@ss'\''\n' > "$f" + _redact_file "$f" + ! grep -q 'S3cr3tP@ss' "$f" + grep -q 'clientPassword: \[REDACTED\]' "$f" + grep -q 'abc-123' "$f" +} + +@test "_redact_file: proxy credentials redacted" { + f="$BATS_TEST_TMPDIR/p.txt" + echo 'HTTP_PROXY=http://user:s3cr3t@proxy.corp:8080' > "$f" + _redact_file "$f" + ! grep -q 's3cr3t' "$f" + grep -q 'http://\[REDACTED\]@proxy.corp:8080' "$f" +} + +@test "_redact_file: password= and token/secret redacted" { + f="$BATS_TEST_TMPDIR/l.txt" + printf 'POST password=hunter2&x=1\ntoken: ghp_SECRETTOKEN\n' > "$f" + _redact_file "$f" + ! grep -q 'hunter2' "$f" + ! grep -q 'ghp_SECRETTOKEN' "$f" +} + +@test "_redact_file: non-secret content left intact" { + f="$BATS_TEST_TMPDIR/n.txt" + echo 'NO_PROXY=localhost,127.0.0.1,.svc' > "$f" + _redact_file "$f" + grep -q '127.0.0.1,.svc' "$f" +} + +@test "_redact_file: missing file is a no-op (no error)" { + run _redact_file "$BATS_TEST_TMPDIR/nope.txt" + [ "$status" -eq 0 ] +} + +# ── run_diagnose (end-to-end, the headline security proof) ────────────────── +@test "run_diagnose: produces a bundle, and a seeded secret does NOT survive in it" { + echo "clientPassword: 'LEAKME123'" > "$HOST_DATA_DIR/values.yaml" + echo "installer log line" > "$HOST_DATA_DIR/install-20260101-000000.log" + has() { return 1; } # no kubectl/docker/helm -> best-effort path + run run_diagnose + [ "$status" -eq 0 ] + [[ "$output" == *"Diagnostics saved"* ]] + tgz="$(ls "$HOST_DATA_DIR"/tracebloc-diagnose-*.tgz 2>/dev/null | head -1)" + [ -n "$tgz" ] + # extract to stdout and confirm the secret was redacted before archiving + ! tar -xzOf "$tgz" 2>/dev/null | grep -q 'LEAKME123' + # but the bundle still contains useful content (the host section) + tar -tzf "$tgz" 2>/dev/null | grep -q '00-host.txt' +} + +@test "run_diagnose: best-effort with no cluster (does not crash)" { + has() { return 1; } + run run_diagnose + [ "$status" -eq 0 ] + [[ "$output" == *"Diagnostics saved"* ]] +} diff --git a/scripts/tests/install-k8s.Tests.ps1 b/scripts/tests/install-k8s.Tests.ps1 index e1cebc6..c978bc3 100644 --- a/scripts/tests/install-k8s.Tests.ps1 +++ b/scripts/tests/install-k8s.Tests.ps1 @@ -8,6 +8,8 @@ BeforeAll { # Stubs so Pester can mock external commands that the functions invoke. function kubectl { } function docker { } + function helm { } + function k3d { } } Describe "Get-BackendUrl" { @@ -378,3 +380,44 @@ Describe "Set-ClusterAutostart" { Should -Invoke docker -Times 0 -Exactly } } + +# --- diagnose support bundle (mirrors scripts/lib/diagnose.sh) --------------- +Describe "Edit-Redaction" { + It "redacts clientPassword / proxy creds / token; keeps clientId + NO_PROXY" { + $f = Join-Path $TestDrive "v.txt" + @" +clientId: "abc-123" +clientPassword: 'S3cr3tP@ss' +HTTP_PROXY=http://user:s3cr3t@proxy:8080 +token: ghp_SECRET +NO_PROXY=localhost,127.0.0.1 +"@ | Set-Content $f + Edit-Redaction $f + $c = Get-Content $f -Raw + $c | Should -Not -Match 'S3cr3tP@ss' + $c | Should -Not -Match 's3cr3t' + $c | Should -Not -Match 'ghp_SECRET' + $c | Should -Match 'abc-123' + $c | Should -Match '127\.0\.0\.1' + } + It "missing file -> no throw" { + { Edit-Redaction (Join-Path $TestDrive "nope.txt") } | Should -Not -Throw + } +} + +Describe "Invoke-DiagnoseBundle" { + It "produces a bundle and a seeded secret does NOT survive in it" { + $HOST_DATA_DIR = Join-Path $TestDrive "tb" + New-Item -ItemType Directory -Path $HOST_DATA_DIR -Force | Out-Null + "clientPassword: 'LEAKME123'" | Set-Content (Join-Path $HOST_DATA_DIR "values.yaml") + Mock kubectl { "" }; Mock docker { "" }; Mock helm { "" }; Mock k3d { "" } + Mock Get-WindowsArch { "amd64" } # avoid the PROCESSOR_ARCHITECTURE Err off-Windows + { Invoke-DiagnoseBundle } | Should -Not -Throw + $zip = Get-ChildItem $HOST_DATA_DIR -Filter 'tracebloc-diagnose-*.zip' | Select-Object -First 1 + $zip | Should -Not -BeNullOrEmpty + $ex = Join-Path $TestDrive "ex" + Expand-Archive -Path $zip.FullName -DestinationPath $ex -Force + $all = (Get-ChildItem $ex -Recurse -File | ForEach-Object { Get-Content $_.FullName -Raw }) -join "`n" + $all | Should -Not -Match 'LEAKME123' + } +} From 8e39a50646e40e063a70ad5a248e67ff226b748a Mon Sep 17 00:00:00 2001 From: Lukas Wuttke Date: Mon, 1 Jun 2026 21:39:03 +0200 Subject: [PATCH 02/13] test(installer): raise changed-line coverage (diagnose collection + real preflight probes) Measured changed-line coverage of the stack had dropped (bash ~84% vs #171's ~96%) because the new code added integration-only branches the mocked unit suites skipped. Recover the unit-testable portion: - diagnose.bats: exercise the kubectl/docker/helm collection path (has()=true + mocked tools) -> diagnose.sh 64% -> 90%. - preflight.bats: test the REAL _pf_probe_url curl-exit-code -> token mapping, the missing-curl path, and the _pf_ncpu/_pf_total_mem_kb/_pf_free_kb readers (re-sourced past the setup stubs) -> preflight.sh 79% -> 90%. Bash changed-line coverage: 83.6% -> 92.3% (kcov, 383/415). The residual ~8% is integration-only (real k3d/docker create + macOS/Windows-specific branches + MAIN orchestration), validated by the live VM E2Es (reboot recovery, auth-proxy, preflight blocked-egress/arm64, diagnose redaction grep). Co-Authored-By: Claude Opus 4.8 --- scripts/tests/diagnose.bats | 20 ++++++++++++++++++++ scripts/tests/preflight.bats | 26 ++++++++++++++++++++++++++ 2 files changed, 46 insertions(+) diff --git a/scripts/tests/diagnose.bats b/scripts/tests/diagnose.bats index ef70ae8..0dbfc98 100644 --- a/scripts/tests/diagnose.bats +++ b/scripts/tests/diagnose.bats @@ -71,3 +71,23 @@ setup() { [ "$status" -eq 0 ] [[ "$output" == *"Diagnostics saved"* ]] } + +@test "run_diagnose: exercises the cluster-data collection when tools are present" { + has() { return 0; } # kubectl/docker/helm "present" + kubectl() { + case "$*" in + *"get pods -A"*) printf 'default default-jobs-manager-abc 1/1 Running\n' ;; + *) printf 'kubectl %s\n' "$*" ;; + esac + } + docker() { printf 'docker %s\n' "$*"; } + helm() { printf 'helm %s\n' "$*"; } + run run_diagnose + [ "$status" -eq 0 ] + tgz="$(ls "$HOST_DATA_DIR"/tracebloc-diagnose-*.tgz 2>/dev/null | head -1)" + [ -n "$tgz" ] + # the kubectl + helm + per-workload-log collection branches ran + tar -tzf "$tgz" | grep -q '02-kubectl.txt' + tar -tzf "$tgz" | grep -q '04-helm.txt' + tar -tzf "$tgz" | grep -q 'logs/mysql-client.log' +} diff --git a/scripts/tests/preflight.bats b/scripts/tests/preflight.bats index 9ec41f9..d1d5ec0 100644 --- a/scripts/tests/preflight.bats +++ b/scripts/tests/preflight.bats @@ -161,3 +161,29 @@ setup() { run run_preflight [ "$status" -eq 0 ] } + +# ── real _pf_probe_url + readers (setup() stubs them; re-source for the real ones) ── +@test "_pf_probe_url: maps curl outcomes to tokens" { + source "${BATS_TEST_DIRNAME}/../lib/preflight.sh" # restore the real function + has() { return 0; } # 'has curl' true + curl() { return 6; }; run _pf_probe_url https://x; [ "$output" = "dns" ] + curl() { return 7; }; run _pf_probe_url https://x; [ "$output" = "refused" ] + curl() { return 28; }; run _pf_probe_url https://x; [ "$output" = "timeout" ] + curl() { return 60; }; run _pf_probe_url https://x; [ "$output" = "tls" ] + curl() { printf '200'; return 0;};run _pf_probe_url https://x; [ "$output" = "ok" ] +} + +@test "_pf_probe_url: missing curl -> nocurl" { + source "${BATS_TEST_DIRNAME}/../lib/preflight.sh" + has() { return 1; } + run _pf_probe_url https://x + [ "$output" = "nocurl" ] +} + +@test "_pf readers return a number on this host" { + source "${BATS_TEST_DIRNAME}/../lib/preflight.sh" + OS="$(uname -s)" + run _pf_ncpu; [[ "$output" =~ ^[0-9]+$ ]] + run _pf_total_mem_kb; [[ "$output" =~ ^[0-9]+$ ]] + run _pf_free_kb /; [[ "$output" =~ ^[0-9]+$ ]] +} From 3b68b53839769b4f2d6455d3d88b1d08acd2ad22 Mon Sep 17 00:00:00 2001 From: Lukas Wuttke Date: Mon, 1 Jun 2026 22:03:25 +0200 Subject: [PATCH 03/13] fix(installer): close two redaction gaps in --diagnose (security review) A pre-merge security review of the support bundle found two ways credentials could land in the (supposedly redacted) archive the customer sends to support: 1. Redaction only matched `clientPassword:` and `password=` -- it missed any other *password key in colon form, so `dockerRegistry.password` (a registry token) and `HTTP_PROXY_PASSWORD` survived. Broadened _redact_file (bash) and Edit-Redaction (ps1) to redact ANY *password key, case-insensitive, in : or = form (portable explicit char classes -- BSD sed has no I flag). 2. (bash only) The bundle collected `helm get manifest`, which renders the k8s Secret objects with base64-encoded CLIENT_PASSWORD + .dockerconfigjson that text redaction can't see. Dropped the manifest collection (helm get values + kubectl output already cover triage). Regression tests added (diagnose.bats + Pester). Re-verified end-to-end on a Linux VM with the real --diagnose flag: clientPassword, the dockerRegistry token, HTTP_PROXY_PASSWORD, and proxy URL creds all have ZERO occurrences in the archive; clientId kept; manifest no longer collected. Co-Authored-By: Claude Opus 4.8 --- scripts/install-k8s.ps1 | 5 +++-- scripts/lib/diagnose.sh | 14 ++++++++++---- scripts/tests/diagnose.bats | 13 +++++++++++++ scripts/tests/install-k8s.Tests.ps1 | 12 ++++++++++++ 4 files changed, 38 insertions(+), 6 deletions(-) diff --git a/scripts/install-k8s.ps1 b/scripts/install-k8s.ps1 index 9d9347e..2f9c310 100644 --- a/scripts/install-k8s.ps1 +++ b/scripts/install-k8s.ps1 @@ -1439,9 +1439,10 @@ function Edit-Redaction([string]$Path) { if (-not (Test-Path $Path)) { return } try { $t = Get-Content -Path $Path -Raw -ErrorAction Stop - $t = $t -replace '(?i)(client[_-]?password\s*[:=]\s*).*', '$1[REDACTED]' + # First rule redacts ANY *password key (clientPassword, dockerRegistry + # password, HTTP_PROXY_PASSWORD, ...) in : or = form, not just clientPassword. + $t = $t -replace '(?i)([A-Za-z0-9_.-]*password\s*[:=]\s*).*', '$1[REDACTED]' $t = $t -replace '([a-zA-Z][a-zA-Z0-9+.-]*://)[^:/@\s]+:[^@/\s]+@', '$1[REDACTED]@' - $t = $t -replace '(?i)(password\s*=\s*)[^\s&"]*', '$1[REDACTED]' $t = $t -replace '(?i)((token|secret|authorization|api[_-]?key)\s*[:=]\s*).*', '$1[REDACTED]' $utf8NoBom = New-Object System.Text.UTF8Encoding($false) [System.IO.File]::WriteAllText($Path, $t, $utf8NoBom) diff --git a/scripts/lib/diagnose.sh b/scripts/lib/diagnose.sh index c732c70..ed74adf 100644 --- a/scripts/lib/diagnose.sh +++ b/scripts/lib/diagnose.sh @@ -20,11 +20,14 @@ _redact_file() { local f="$1" [[ -f "$f" ]] || return 0 + # Case-insensitive via explicit classes (BSD/macOS sed has no `I` flag). The + # first rule redacts ANY *password key (clientPassword, dockerRegistry + # `password:`, HTTP_PROXY_PASSWORD, …) in either `:` or `=` form — not just + # clientPassword — so registry/proxy/db passwords don't leak into the bundle. sed -i.bak -E \ - -e 's/([Cc]lient[_-]?[Pp]assword[[:space:]]*[:=][[:space:]]*).*/\1[REDACTED]/' \ + -e 's/([A-Za-z0-9_.-]*[Pp][Aa][Ss][Ss][Ww][Oo][Rr][Dd][[:space:]]*[:=][[:space:]]*).*/\1[REDACTED]/' \ -e 's#([a-zA-Z][a-zA-Z0-9+.-]*://)[^:/@[:space:]]+:[^@/[:space:]]+@#\1[REDACTED]@#g' \ - -e 's/([Pp]assword[[:space:]]*=[[:space:]]*)[^[:space:]&"]*/\1[REDACTED]/g' \ - -e 's/(([Tt]oken|[Ss]ecret|[Aa]uthorization|[Aa]pi[_-]?[Kk]ey)[[:space:]]*[:=][[:space:]]*).*/\1[REDACTED]/' \ + -e 's/(([Tt][Oo][Kk][Ee][Nn]|[Ss][Ee][Cc][Rr][Ee][Tt]|[Aa][Uu][Tt][Hh][Oo][Rr][Ii][Zz][Aa][Tt][Ii][Oo][Nn]|[Aa][Pp][Ii][_-]?[Kk][Ee][Yy])[[:space:]]*[:=][[:space:]]*).*/\1[REDACTED]/' \ "$f" 2>/dev/null rm -f "${f}.bak" 2>/dev/null } @@ -121,10 +124,13 @@ run_diagnose() { # ── helm (redacted afterwards) ── if has helm; then + # NOTE: deliberately NOT collecting `helm get manifest` — it renders the + # Secret objects with base64-encoded credentials (CLIENT_PASSWORD, + # .dockerconfigjson), which the text redaction can't see. `helm get values` + # (input values, redacted) + the kubectl output already cover triage. { echo "## helm list -A"; helm list -A 2>&1 echo; echo "## helm get values $ns"; helm get values "$ns" -n "$ns" 2>&1 - echo; echo "## helm get manifest $ns (truncated)"; helm get manifest "$ns" -n "$ns" 2>&1 | head -200 } > "$d/04-helm.txt" 2>&1 fi diff --git a/scripts/tests/diagnose.bats b/scripts/tests/diagnose.bats index 0dbfc98..93f4b97 100644 --- a/scripts/tests/diagnose.bats +++ b/scripts/tests/diagnose.bats @@ -44,6 +44,17 @@ setup() { grep -q '127.0.0.1,.svc' "$f" } +# Finding 1 (security review): any *password key must be redacted, not just +# clientPassword — covers dockerRegistry password, HTTP_PROXY_PASSWORD, caps. +@test "_redact_file: redacts dockerRegistry/proxy/db password keys (: and =, any case)" { + f="$BATS_TEST_TMPDIR/g.yaml" + printf 'dockerRegistry:\n password: dckr_REGTOKEN\nHTTP_PROXY_PASSWORD: PROXYPW123\nMYSQL_ROOT_PASSWORD=ROOTPW123\n' > "$f" + _redact_file "$f" + ! grep -q 'dckr_REGTOKEN' "$f" + ! grep -q 'PROXYPW123' "$f" + ! grep -q 'ROOTPW123' "$f" +} + @test "_redact_file: missing file is a no-op (no error)" { run _redact_file "$BATS_TEST_TMPDIR/nope.txt" [ "$status" -eq 0 ] @@ -90,4 +101,6 @@ setup() { tar -tzf "$tgz" | grep -q '02-kubectl.txt' tar -tzf "$tgz" | grep -q '04-helm.txt' tar -tzf "$tgz" | grep -q 'logs/mysql-client.log' + # Finding 2 (security review): `helm get manifest` (base64 Secrets) is NOT collected + ! tar -xzOf "$tgz" 2>/dev/null | grep -q 'get manifest' } diff --git a/scripts/tests/install-k8s.Tests.ps1 b/scripts/tests/install-k8s.Tests.ps1 index c978bc3..a092fad 100644 --- a/scripts/tests/install-k8s.Tests.ps1 +++ b/scripts/tests/install-k8s.Tests.ps1 @@ -400,6 +400,18 @@ NO_PROXY=localhost,127.0.0.1 $c | Should -Match 'abc-123' $c | Should -Match '127\.0\.0\.1' } + It "redacts any *password key (dockerRegistry password, HTTP_PROXY_PASSWORD)" { + $f = Join-Path $TestDrive "g.txt" + @" +dockerRegistry: + password: dckr_REGTOKEN +HTTP_PROXY_PASSWORD: PROXYPW123 +"@ | Set-Content $f + Edit-Redaction $f + $c = Get-Content $f -Raw + $c | Should -Not -Match 'dckr_REGTOKEN' + $c | Should -Not -Match 'PROXYPW123' + } It "missing file -> no throw" { { Edit-Redaction (Join-Path $TestDrive "nope.txt") } | Should -Not -Throw } From f5b728cd65f87b474656812144fad955ad059df4 Mon Sep 17 00:00:00 2001 From: Lukas Wuttke Date: Mon, 1 Jun 2026 22:21:39 +0200 Subject: [PATCH 04/13] fix(installer): correctness fixes from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A pre-merge correctness review (high effort) over the stack found three real bugs — the failure paths weren't exercised by the passing tests: 1. cluster.sh: `k3d "${K3D_ARGS[@]}" > ...` was a bare command under `set -e`, so a k3d-create FAILURE aborted the script immediately, skipping the 'already exists' graceful reuse, the error dump, AND the proxy temp-dir cleanup. Capture rc set-e-safely (`&& create_rc=0 || create_rc=$?`). Proven under set -e: both the error-dump and reuse paths now run. 2. summary.sh: CLIENT_STATE was defaulted to "starting" at source time, so install_cleanup's `[[ -z "$CLIENT_STATE" ]]` guard was always false and the "did not complete / check the log / safe to re-run" hint never printed on an early failure (preflight / docker / cluster / helm). Default it empty; the readiness gate sets the real state. 3. preflight.sh: with curl absent (direct ./install-k8s.sh on a minimal VM, before install_system_deps adds curl), the connectivity probes returned 'nocurl' and hard-failed with a misleading "egress blocked". Skip the check with a warning when curl isn't present yet. Also defensively quote `switch ("$env:CLIENT_ENV")` in Get-BackendUrl so the prod default fires regardless of PowerShell version (refuted as a live bug on pwsh 7 -- default does fire -- but cheap insurance for the unvalidated 5.1 path). Tests: +nocurl-skip test, fixed an over-blunt has() mock; bats 120 / Pester 52 green. (#1's set -e abort can't be caught by bats -- no set -e there -- so it was verified manually under set -e.) Co-Authored-By: Claude Opus 4.8 --- scripts/install-k8s.ps1 | 4 +++- scripts/lib/cluster.sh | 6 ++++-- scripts/lib/preflight.sh | 7 +++++++ scripts/lib/summary.sh | 5 ++++- scripts/tests/preflight.bats | 11 ++++++++++- 5 files changed, 28 insertions(+), 5 deletions(-) diff --git a/scripts/install-k8s.ps1 b/scripts/install-k8s.ps1 index 2f9c310..67dcfbb 100644 --- a/scripts/install-k8s.ps1 +++ b/scripts/install-k8s.ps1 @@ -965,7 +965,9 @@ function Get-TraceblocYamlValue { # Resolve the backend base URL the same way jobs-manager does # (client-runtime/controller.py: CLIENT_ENV -> backend), defaulting to prod. function Get-BackendUrl { - switch ($env:CLIENT_ENV) { + # Quote the value so a truly-unset CLIENT_ENV ($null) coerces to "" and the + # default (prod) branch reliably fires across PowerShell versions. + switch ("$env:CLIENT_ENV") { "dev" { return "https://dev-api.tracebloc.io/" } "stg" { return "https://stg-api.tracebloc.io/" } default { return "https://api.tracebloc.io/" } diff --git a/scripts/lib/cluster.sh b/scripts/lib/cluster.sh index e8c3541..ccb22df 100755 --- a/scripts/lib/cluster.sh +++ b/scripts/lib/cluster.sh @@ -291,8 +291,10 @@ _create_new_cluster() { local create_out create_rc create_out="$(mktemp)" - k3d "${K3D_ARGS[@]}" >"$create_out" 2>&1 - create_rc=$? + # Capture the exit code WITHOUT tripping `set -e`: a bare failing command here + # would abort the script immediately, skipping the 'already exists' reuse path, + # the error dump, and the temp-dir cleanup below. + k3d "${K3D_ARGS[@]}" >"$create_out" 2>&1 && create_rc=0 || create_rc=$? [[ -n "$proxy_cfg" ]] && rm -rf "${proxy_cfg%/*}" if [[ $create_rc -ne 0 ]]; then if grep -qi "already exists\|a cluster with that name already exists" "$create_out" 2>/dev/null; then diff --git a/scripts/lib/preflight.sh b/scripts/lib/preflight.sh index 2e624e4..2e1df9f 100644 --- a/scripts/lib/preflight.sh +++ b/scripts/lib/preflight.sh @@ -170,6 +170,13 @@ _pf_disk() { _pf_connectivity() { info "Checking outbound connectivity to required services..." + # Can't probe without curl — and on the direct ./install-k8s.sh path the + # installer hasn't installed it yet. Skip with a warning rather than hard-fail + # with a misleading "egress blocked" (curl is installed downstream). + if ! has curl; then + warn "Skipping connectivity check — curl isn't available yet (the installer will add it)." + return 0 + fi local backend_host cfail=0 tls_seen=0 c label url status backend_host="$(_pf_backend_host)" diff --git a/scripts/lib/summary.sh b/scripts/lib/summary.sh index 8e72ba5..3a501bb 100755 --- a/scripts/lib/summary.sh +++ b/scripts/lib/summary.sh @@ -18,7 +18,10 @@ _log_cluster_status() { # CLIENT_STATE so the summary reports the truth instead of an unconditional # "installed successfully": # connected | starting | bad_creds | image_pull | crash -CLIENT_STATE="starting" +# Empty until wait_for_client_ready runs — so install_cleanup can distinguish an +# early failure (before the readiness gate, CLIENT_STATE still empty) from a +# reported outcome, and still print the "check the log / safe to re-run" hint. +CLIENT_STATE="" READY_TIMEOUT="${READY_TIMEOUT:-300}" wait_for_client_ready() { diff --git a/scripts/tests/preflight.bats b/scripts/tests/preflight.bats index d1d5ec0..a852c11 100644 --- a/scripts/tests/preflight.bats +++ b/scripts/tests/preflight.bats @@ -86,7 +86,7 @@ setup() { @test "_pf_connectivity: tool host probed (warn-only) when the tool is missing" { _pf_probe_url() { case "$1" in *get.docker.com*) echo blocked ;; *) echo ok ;; esac; } - has() { return 1; } + has() { [[ "$1" == "curl" ]]; } # curl present (probing possible), other tools missing OS=Linux run _pf_connectivity [[ "$output" == *"get.docker.com"* ]] @@ -187,3 +187,12 @@ setup() { run _pf_total_mem_kb; [[ "$output" =~ ^[0-9]+$ ]] run _pf_free_kb /; [[ "$output" =~ ^[0-9]+$ ]] } + +# Code review: curl absent must SKIP connectivity (curl is installed downstream), +# not hard-fail with a misleading "egress blocked". +@test "_pf_connectivity: no curl -> warn + skip, not a hard fail" { + has() { return 1; } + run _pf_connectivity + [[ "$output" == *"Skipping connectivity"* ]] + PF_HARD_FAIL=0; _pf_connectivity >/dev/null 2>&1; [ "$PF_HARD_FAIL" -eq 0 ] +} From 33a1edb85bb94d3606d1dc8a2c71a358cd49bda2 Mon Sep 17 00:00:00 2001 From: Lukas Wuttke Date: Tue, 2 Jun 2026 08:59:34 +0200 Subject: [PATCH 05/13] fix(installer): handle a Docker daemon that won't start (stop misdiagnosing it as a group issue) Asad's AlmaLinux 9 / EC2 test: docker-ce installed fine but `dockerd` crashed on startup (exit 1), systemd throttled it ("Start request repeated too quickly"), and the installer then printed "Could not connect to Docker -- try logging out and back in" -- the GROUP-not-active hint, which is wrong for a dead daemon and sent him in circles (logout/login didn't help). The throttle also means a bare re-run can't recover. install_docker_engine now: - `systemctl enable docker` WITHOUT `--now` (a start failure no longer hard-aborts the script under `set -e` at that line); - `systemctl reset-failed docker` before starting, so a throttled/failed unit from a prior attempt can be retried (a plain re-run now recovers); - when `docker info` fails AND the daemon isn't active (vs. the group-not-active case, which is still re-exec'd via `sg docker`), surface Docker's OWN error (systemctl status + the journalctl error lines) with likely RHEL/AlmaLinux causes, instead of the misleading group hint. Test: setup-linux.bats daemon-won't-start case; bats 121 green. NOTE: this fixes the installer's HANDLING. Asad's root cause (why dockerd exits 1 on that box) is still masked by the systemd throttle and is being chased separately. Co-Authored-By: Claude Opus 4.8 --- scripts/lib/setup-linux.sh | 24 +++++++++++++++++++++++- scripts/tests/setup-linux.bats | 15 +++++++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/scripts/lib/setup-linux.sh b/scripts/lib/setup-linux.sh index c934124..5c7c6a3 100755 --- a/scripts/lib/setup-linux.sh +++ b/scripts/lib/setup-linux.sh @@ -41,21 +41,43 @@ install_docker_engine() { spin_cmd "Installing Docker…" sudo bash "$docker_script" rm -f "$docker_script" fi - sudo systemctl enable --now docker + # Enable for boot only (no --now): starting is handled below, where a start + # failure is diagnosed instead of aborting the whole script under `set -e`. + sudo systemctl enable docker >/dev/null 2>&1 || true sudo usermod -aG docker "$USER" success "Docker" else success "Docker" fi + # Clear any failed/throttled state from a previous attempt first — a crashed + # daemon leaves the unit in "Start request repeated too quickly", which makes + # systemctl refuse a plain start (so a bare re-run can never recover). Both + # commands are best-effort; the `docker info` check below is the real gate. + sudo systemctl reset-failed docker 2>/dev/null || true sudo systemctl start docker 2>/dev/null || true if ! docker info &>/dev/null 2>&1; then + # (a) Group not active in THIS shell yet → re-exec under the docker group. if [[ -z "${_K3S_INSTALL_REEXEC:-}" ]] && id -nG "$USER" 2>/dev/null | grep -qw docker; then SELF="$(readlink -f "$0" 2>/dev/null || echo "$0")" log "Docker group not yet active in this session — re-executing script..." exec sg docker -c "_K3S_INSTALL_REEXEC=1 bash '$SELF'" fi + # (b) The daemon itself isn't running → a Docker/host problem, not a group + # one. Surface Docker's OWN error (a 'log out and back in' hint would just + # send the user in circles, as it can't fix a crashing daemon). + if ! sudo systemctl is-active --quiet docker 2>/dev/null; then + echo "" + warn "Docker is installed, but its daemon won't start — this is a Docker/host issue, not tracebloc." + hint "Common causes on RHEL/AlmaLinux: SELinux or iptables/nftables init, an overlay" + hint "storage-driver problem, or too little space on /var/lib/docker. Docker's own error:" + { sudo systemctl status docker.service --no-pager -l 2>&1 | tail -6 + sudo journalctl -u docker.service --no-pager 2>/dev/null \ + | grep -iE 'level=(error|fatal)|failed to|cannot |unable |no such' | tail -12; } | sed 's/^/ /' + echo "" + error "Start Docker manually (fix the error above), then re-run this installer." + fi error "Could not connect to Docker. Try logging out and back in, then re-run the script." fi log "Docker daemon running." diff --git a/scripts/tests/setup-linux.bats b/scripts/tests/setup-linux.bats index 86b6548..783c343 100644 --- a/scripts/tests/setup-linux.bats +++ b/scripts/tests/setup-linux.bats @@ -135,3 +135,18 @@ setup() { run mock_calls [ -z "$output" ] } + +# ── install_docker_engine: dead daemon vs group-not-active (Asad's Alma9 case) ── +@test "install_docker_engine: daemon won't start -> Docker's error, not the group hint" { + PRESENT_CMDS="docker" # docker present -> skip install + docker() { return 1; } # docker info fails + id() { echo "testuser"; } # NOT in docker group -> no sg re-exec + sudo() { + case "$*" in *"is-active"*) return 1 ;; esac # daemon not active + record "sudo $*"; return 0 + } + run install_docker_engine + [ "$status" -ne 0 ] + [[ "$output" == *"daemon won't start"* ]] + [[ "$output" != *"logging out"* ]] # the misleading group hint is NOT used +} From f66095b7dc00694e79349c69b6fc3a552522ef4a Mon Sep 17 00:00:00 2001 From: Lukas Wuttke Date: Tue, 2 Jun 2026 09:07:22 +0200 Subject: [PATCH 06/13] fix(installer): load the kernel modules Docker/k3s need (fixes dockerd on minimal RHEL/AlmaLinux) Root cause of Asad's AlmaLinux 9 / EC2 failure: dockerd died on startup with "failed to register bridge driver: iptables ... addrtype ... missing kernel module". Minimal RHEL/AlmaLinux cloud images (incl. AWS) ship kernel-modules-core but NOT the full kernel-modules package, so xt_addrtype (+ br_netfilter, overlay) aren't available and Docker can't program its bridge NAT rules. New _ensure_kernel_modules() (setup-linux.sh), called before starting Docker: modprobe overlay / br_netfilter / xt_addrtype / iptable_nat / ip_tables; on RHEL-family, if a load fails, `dnf install kernel-modules-$(uname -r)` and retry; persist to /etc/modules-load.d for reboots. Best-effort + idempotent (verified clean on a healthy Ubuntu box). Also sharpened the daemon-won't-start hint to point at the kernel-modules remedy when the error mentions addrtype/missing module. This is the hospital-VM profile (minimal RHEL/Alma), so it's a real install-side fix, not just error handling. Test: setup-linux.bats _ensure_kernel_modules; bats 122. Co-Authored-By: Claude Opus 4.8 --- scripts/lib/setup-linux.sh | 29 +++++++++++++++++++++++++++-- scripts/tests/setup-linux.bats | 12 ++++++++++++ 2 files changed, 39 insertions(+), 2 deletions(-) diff --git a/scripts/lib/setup-linux.sh b/scripts/lib/setup-linux.sh index 5c7c6a3..432f456 100755 --- a/scripts/lib/setup-linux.sh +++ b/scripts/lib/setup-linux.sh @@ -14,6 +14,26 @@ setup_pm() { else error "No supported package manager found."; fi } +# ── Kernel modules Docker + k3s need ───────────────────────────────────────── +# Docker's bridge driver programs iptables NAT rules using the `addrtype` match +# (xt_addrtype), and k3s needs br_netfilter + overlay. Minimal RHEL/AlmaLinux +# cloud images (e.g. AWS EC2) ship kernel-modules-core but NOT the full +# kernel-modules package that carries these, so dockerd dies on startup with +# "iptables … addrtype … missing kernel module". Load them — installing the +# matching kernel-modules package on RHEL-family if a load fails — and persist +# for reboots. Best-effort + idempotent; harmless where the modules already exist. +_ensure_kernel_modules() { + local mods="overlay br_netfilter xt_addrtype iptable_nat ip_tables" + local m missing="" + for m in $mods; do sudo modprobe "$m" 2>/dev/null || missing=1; done + if [[ -n "$missing" ]] && has dnf; then + spin_cmd "Installing kernel modules for Docker/k3s…" \ + sudo dnf install -y -q "kernel-modules-$(uname -r)" || true + for m in $mods; do sudo modprobe "$m" 2>/dev/null || true; done + fi + printf '%s\n' $mods | sudo tee /etc/modules-load.d/tracebloc.conf >/dev/null 2>&1 || true +} + # ── Docker Engine ──────────────────────────────────────────────────────────── install_docker_engine() { if ! has docker; then @@ -50,6 +70,10 @@ install_docker_engine() { success "Docker" fi + # Load the kernel modules dockerd's bridge driver + k3s need BEFORE starting, + # so minimal RHEL/AlmaLinux images don't fail with the "addrtype" iptables error. + _ensure_kernel_modules + # Clear any failed/throttled state from a previous attempt first — a crashed # daemon leaves the unit in "Start request repeated too quickly", which makes # systemctl refuse a plain start (so a bare re-run can never recover). Both @@ -70,8 +94,9 @@ install_docker_engine() { if ! sudo systemctl is-active --quiet docker 2>/dev/null; then echo "" warn "Docker is installed, but its daemon won't start — this is a Docker/host issue, not tracebloc." - hint "Common causes on RHEL/AlmaLinux: SELinux or iptables/nftables init, an overlay" - hint "storage-driver problem, or too little space on /var/lib/docker. Docker's own error:" + hint "If the error below mentions 'addrtype' / 'missing kernel module', the host lacks the" + hint "netfilter modules Docker needs — try: sudo dnf install -y kernel-modules-\$(uname -r) && sudo reboot" + hint "Other causes: SELinux, an overlay storage-driver issue, or low /var/lib/docker disk. Docker's error:" { sudo systemctl status docker.service --no-pager -l 2>&1 | tail -6 sudo journalctl -u docker.service --no-pager 2>/dev/null \ | grep -iE 'level=(error|fatal)|failed to|cannot |unable |no such' | tail -12; } | sed 's/^/ /' diff --git a/scripts/tests/setup-linux.bats b/scripts/tests/setup-linux.bats index 783c343..eac2bc1 100644 --- a/scripts/tests/setup-linux.bats +++ b/scripts/tests/setup-linux.bats @@ -150,3 +150,15 @@ setup() { [[ "$output" == *"daemon won't start"* ]] [[ "$output" != *"logging out"* ]] # the misleading group hint is NOT used } + +# Asad's root cause: minimal AlmaLinux lacks xt_addrtype -> dockerd bridge init fails. +@test "_ensure_kernel_modules: modprobes modules + installs kernel-modules on a load failure" { + has() { [[ "$1" == "dnf" ]]; } + sudo() { record "sudo $*"; case "$*" in *modprobe*) return 1 ;; esac; return 0; } + spin_cmd() { record "$*"; return 0; } + run _ensure_kernel_modules + run mock_calls + [[ "$output" == *"modprobe overlay"* ]] + [[ "$output" == *"modprobe xt_addrtype"* ]] + [[ "$output" == *"kernel-modules-"* ]] # RHEL fallback install fired +} From 4d4bb7f336c09011f01df62d6ed49bc112f542e8 Mon Sep 17 00:00:00 2001 From: Lukas Wuttke Date: Tue, 2 Jun 2026 09:45:33 +0200 Subject: [PATCH 07/13] ci(installer): cross-distro prereq matrix + real-Windows Pester + static gate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds .github/workflows/installer-tests.yaml to validate the installer across the breadth of environments customers actually run — not just Ubuntu-amd64: • static — shellcheck (clean at --severity=warning) + bash -n + PSScriptAnalyzer • unit-bash — bats (mocked), 124 tests • unit-pester — Pester on Linux pwsh AND real windows-latest (the .ps1's true target) • distro-prereqs — NEW: runs the REAL Linux prereq path (PM detect, system deps, Docker branch, kernel modules, kubectl/k3d/helm) in a fresh container per distro family: ubuntu 22.04/24.04, debian 12, almalinux 9/8, rockylinux 9, amazonlinux 2023, fedora, opensuse leap. The matrix paid for itself before it even shipped: validating it locally against real distro containers surfaced a genuine gap — minimal Amazon Linux 2023 ships no openssl/tar, so helm's get-helm-3 fails ("openssl must first be installed"). Fixed in install_system_deps (ensure openssl + tar; package names are uniform across apt/dnf/yum/zypper/pacman), with bats coverage. All 9 validated distro branches now install every prerequisite. Installer-test jobs moved out of helm-ci.yaml into their own workflow (no more duplicate runs; helm-ci no longer triggers on scripts/** changes). Arch omitted (x86-only image + bare-container keyring friction; pacman branch covered by bats). Real k3d cluster-up (e2e) intentionally deferred — needs a stubbed backend; tracked. Validated locally via mac Docker: ubuntu:22.04, almalinux:9, amazonlinux:2023, opensuse/leap:15.6 → all PASS; bats 124 green; shellcheck 0 findings. Co-Authored-By: Claude Opus 4.8 --- .github/workflows/helm-ci.yaml | 30 +----- .github/workflows/installer-tests.yaml | 130 +++++++++++++++++++++++++ scripts/lib/setup-linux.sh | 6 ++ scripts/tests/distro-prereqs.sh | 115 ++++++++++++++++++++++ scripts/tests/setup-linux.bats | 16 +++ 5 files changed, 270 insertions(+), 27 deletions(-) create mode 100644 .github/workflows/installer-tests.yaml create mode 100755 scripts/tests/distro-prereqs.sh diff --git a/.github/workflows/helm-ci.yaml b/.github/workflows/helm-ci.yaml index 061a4b5..9abefb0 100644 --- a/.github/workflows/helm-ci.yaml +++ b/.github/workflows/helm-ci.yaml @@ -6,14 +6,12 @@ on: paths: - 'client/**' - 'ingestor/**' - - 'scripts/**' - '.github/workflows/helm-ci.yaml' pull_request: branches: [main, develop, openshift] paths: - 'client/**' - 'ingestor/**' - - 'scripts/**' - '.github/workflows/helm-ci.yaml' jobs: @@ -118,28 +116,6 @@ jobs: > /dev/null echo "Schema validation passed for ${{ matrix.platform }}" - installer-tests: - name: Installer script tests - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - - name: Install bats - run: sudo apt-get update -qq && sudo apt-get install -y -qq bats - - - name: bats unit tests (bash installer) - run: bats scripts/tests/*.bats - - - name: Pester unit tests (PowerShell installer) - shell: pwsh - env: - TB_PESTER: "1" - run: | - Set-PSRepository PSGallery -InstallationPolicy Trusted - Install-Module Pester -MinimumVersion 5.5.0 -Force -SkipPublisherCheck -Scope CurrentUser - Import-Module Pester -MinimumVersion 5.5.0 -Force - $cfg = New-PesterConfiguration - $cfg.Run.Path = "scripts/tests/install-k8s.Tests.ps1" - $cfg.Run.Exit = $true - $cfg.Output.Verbosity = "Detailed" - Invoke-Pester -Configuration $cfg + # Installer script tests (bats + Pester) + the cross-distro prerequisite matrix + # live in their own workflow: .github/workflows/installer-tests.yaml + # (triggered on scripts/** changes). diff --git a/.github/workflows/installer-tests.yaml b/.github/workflows/installer-tests.yaml new file mode 100644 index 0000000..bab7b90 --- /dev/null +++ b/.github/workflows/installer-tests.yaml @@ -0,0 +1,130 @@ +name: Installer tests + +# Validates the curl/PowerShell installer (scripts/) across the breadth of +# environments a customer might actually have: +# • static — shellcheck + bash -n + PSScriptAnalyzer +# • unit-bash — bats (mocked) for the bash installer +# • unit-pester— Pester for the PowerShell installer, on Linux AND real Windows +# • distro-prereqs — runs the REAL Linux prerequisite-install path (package +# manager, system deps, Docker branch, kernel modules, kubectl/ +# k3d/helm) inside a fresh container for each major distro family. +# This is what catches "works on Ubuntu, breaks on minimal RHEL" +# bugs that mocked unit tests can't see. +on: + push: + branches: [main, develop, openshift] + paths: + - 'scripts/**' + - '.github/workflows/installer-tests.yaml' + pull_request: + branches: [main, develop, openshift] + paths: + - 'scripts/**' + - '.github/workflows/installer-tests.yaml' + schedule: + - cron: '0 3 * * 1' # Mondays 03:00 UTC — catch drift as distro base images move + workflow_dispatch: + +permissions: + contents: read + +jobs: + static: + name: Static analysis + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: bash -n (syntax) on every script + run: | + find scripts -type f \( -name '*.sh' -o -name '*.bats' \) -print0 \ + | while IFS= read -r -d '' f; do bash -n "$f" || exit 1; done + echo "all scripts parse" + + - name: ShellCheck (libs + entrypoints) + run: | + sudo apt-get update -qq && sudo apt-get install -y -qq shellcheck + shellcheck --version | grep version + shellcheck --severity=warning --shell=bash \ + scripts/install.sh scripts/install-k8s.sh scripts/lib/*.sh \ + scripts/tests/distro-prereqs.sh + + - name: PSScriptAnalyzer (PowerShell installer) + shell: pwsh + run: | + Set-PSRepository PSGallery -InstallationPolicy Trusted + Install-Module PSScriptAnalyzer -Force -SkipPublisherCheck -Scope CurrentUser + $issues = Invoke-ScriptAnalyzer -Path scripts/install-k8s.ps1 -Severity Error,Warning + if ($issues) { $issues | Format-Table -AutoSize } + $errs = @($issues | Where-Object { $_.Severity -eq 'Error' }) + if ($errs.Count -gt 0) { Write-Error "PSScriptAnalyzer: $($errs.Count) error(s)"; exit 1 } + Write-Host "no PSScriptAnalyzer errors" + + unit-bash: + name: bats (bash unit, mocked) + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Install bats + run: sudo apt-get update -qq && sudo apt-get install -y -qq bats + - name: Run bats + run: bats scripts/tests/*.bats + + unit-pester: + # Pester on Linux pwsh (fast) AND real Windows — the .ps1 installer's actual + # target. fail-fast:false so a Windows-only surprise doesn't mask Linux signal. + name: Pester (${{ matrix.os }}) + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, windows-latest] + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v4 + - name: Run Pester + shell: pwsh + env: + TB_PESTER: "1" + run: | + Set-PSRepository PSGallery -InstallationPolicy Trusted + Install-Module Pester -MinimumVersion 5.5.0 -Force -SkipPublisherCheck -Scope CurrentUser + Import-Module Pester -MinimumVersion 5.5.0 -Force + $cfg = New-PesterConfiguration + $cfg.Run.Path = "scripts/tests/install-k8s.Tests.ps1" + $cfg.Run.Exit = $true + $cfg.Output.Verbosity = "Detailed" + Invoke-Pester -Configuration $cfg + + distro-prereqs: + # Runs the installer's REAL Linux prerequisite path in a fresh container for + # each distro family. Proves the package-manager / Docker / conntrack / helm + # branches all resolve and install — the layer where every installer bug we + # have shipped lived (#718 PATH, #719 RHEL docker-ce, #720 conntrack-tools, + # the AlmaLinux kernel-modules gap, the Amazon Linux openssl/tar gap). + # It does NOT start dockerd or create a cluster — that needs a real kernel + + # systemd, covered by local VMs and (optionally) a future e2e job. Arch is + # omitted: its official image is x86-only and bare containers need keyring + # bootstrapping; the pacman branch is covered by the bats unit test. + name: Prereqs — ${{ matrix.distro }} + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + distro: + - 'ubuntu:22.04' # apt / get.docker.com — most common server + - 'ubuntu:24.04' # newest LTS + - 'debian:12' # apt + - 'almalinux:9' # dnf + docker-ce repo (RHEL rebuild, #719) + - 'almalinux:8' # older RHEL rebuild + - 'rockylinux:9' # the other RHEL rebuild + - 'amazonlinux:2023' # dnf + 'docker' pkg — common AWS default + - 'fedora:latest' # dnf, falls through to get.docker.com + - 'opensuse/leap:15.6' # zypper + steps: + - uses: actions/checkout@v4 + - name: Install prerequisites in ${{ matrix.distro }} + env: + DISTRO: ${{ matrix.distro }} + run: | + docker run --rm -v "$PWD:/src:ro" -w /src "$DISTRO" \ + bash scripts/tests/distro-prereqs.sh diff --git a/scripts/lib/setup-linux.sh b/scripts/lib/setup-linux.sh index 432f456..bc42398 100755 --- a/scripts/lib/setup-linux.sh +++ b/scripts/lib/setup-linux.sh @@ -117,6 +117,12 @@ install_system_deps() { MISSING_PKGS=() has curl || MISSING_PKGS+=(curl) has conntrack || MISSING_PKGS+=("$conntrack_pkg") + # helm's get-helm-3 verifies its download checksum with openssl and unpacks a + # tarball with tar; minimal cloud images (Amazon Linux 2023, minimal RHEL) ship + # neither, so the Helm install fails. Ensure both (package names are uniform + # across apt/dnf/yum/zypper/pacman, unlike conntrack). + has openssl || MISSING_PKGS+=(openssl) + has tar || MISSING_PKGS+=(tar) if [[ ${#MISSING_PKGS[@]} -gt 0 ]]; then spin_cmd "Updating package index…" $PM_UPDATE for pkg in "${MISSING_PKGS[@]}"; do diff --git a/scripts/tests/distro-prereqs.sh b/scripts/tests/distro-prereqs.sh new file mode 100755 index 0000000..6458895 --- /dev/null +++ b/scripts/tests/distro-prereqs.sh @@ -0,0 +1,115 @@ +#!/usr/bin/env bash +# ============================================================================= +# distro-prereqs.sh — cross-distro prerequisite-install smoke test +# ----------------------------------------------------------------------------- +# Runs the installer's REAL Linux prerequisite logic inside a fresh distro +# container and asserts every prerequisite binary lands on PATH: +# +# setup_pm → correct package manager detected for this distro +# install_system_deps → conntrack installed under the right package name (#720) +# install_docker_engine → correct Docker branch taken (get.docker.com vs the +# docker-ce repo for RHEL rebuilds #719, dnf/yum/zypper/ +# pacman), Docker package actually installed +# _ensure_kernel_modules → netfilter modules loaded / kernel-modules fallback +# (Asad's AlmaLinux xt_addrtype case) — best-effort +# install_kubectl / install_k3d (PATH-through-sudo #718) / install_helm +# +# It deliberately does NOT start the Docker daemon or create a k3d cluster — +# that needs a real kernel + systemd (covered by the e2e job on the Ubuntu +# runners and by the local Lima/VM matrix). This proves each distro's BRANCH +# does the right thing, which is where every installer bug we have shipped lived. +# +# Usage (inside a container, as root): +# bash scripts/tests/distro-prereqs.sh +# Typically driven by CI: +# docker run --rm -v "$PWD:/src:ro" -w /src bash scripts/tests/distro-prereqs.sh +# ============================================================================= +set -uo pipefail + +HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +LIB="$HERE/../lib" + +# ── Make the container resemble a real host ────────────────────────────────── +# Anyone running the real installer reached it via `curl | bash`, so curl always +# exists and the box has sudo. Minimal base images ship neither — install them +# up front (we are root here) so the rest of the run mirrors a real machine. +_pm_install_one() { # install a single package with whatever PM exists + if command -v apt-get >/dev/null 2>&1; then apt-get update -qq && apt-get install -y -qq "$1" + elif command -v dnf >/dev/null 2>&1; then dnf install -y -q "$1" + elif command -v yum >/dev/null 2>&1; then yum install -y -q "$1" + elif command -v zypper >/dev/null 2>&1; then zypper --non-interactive install "$1" + elif command -v pacman >/dev/null 2>&1; then pacman -Sy --noconfirm "$1" + fi +} +_bootstrap_host() { + command -v curl >/dev/null 2>&1 && command -v sudo >/dev/null 2>&1 && return 0 + echo "── bootstrapping curl + sudo ──" + # Install sudo and curl independently. The installer calls `sudo` for every + # privileged step (a real host has it); minimal images may not. Only add curl + # if the binary is truly absent — RHEL 9 ships curl-minimal, which provides + # curl, and `dnf install curl` would hit the curl/curl-minimal conflict. + command -v sudo >/dev/null 2>&1 || _pm_install_one sudo + command -v curl >/dev/null 2>&1 || _pm_install_one curl +} +_bootstrap_host + +# shellcheck source=/dev/null +source "$LIB/common.sh" +# shellcheck source=/dev/null +source "$LIB/setup-linux.sh" + +# The real entrypoint runs validate_config first, which guarantees $USER is set +# (usermod -aG docker "$USER" runs under `set -u`). Containers often don't export +# USER — mirror that precondition so we test the install path, not a missing env. +export USER="${USER:-$(id -un)}" + +# ── Context banner ─────────────────────────────────────────────────────────── +PRETTY="$( . /etc/os-release 2>/dev/null && echo "${PRETTY_NAME:-unknown}" )" +echo "" +echo "═══════════════════════════════════════════════════════════════════════" +echo " distro : ${PRETTY}" +echo " arch : $(uname -m) (ARCH_DL=${ARCH_DL})" +echo " kernel : $(uname -r)" +echo "═══════════════════════════════════════════════════════════════════════" + +# umask 077 (set by common.sh) would make /usr/local/bin tools root-exec-only; +# install_linux relaxes to 022 around the tool installs — mirror that here. +umask 022 + +# ── Run the real prereq path ───────────────────────────────────────────────── +setup_pm +echo "→ PM_INSTALL = ${PM_INSTALL}" + +install_system_deps + +# install_docker_engine installs the Docker package via the distro-specific +# branch, then gates on a *running* daemon — which cannot come up without +# systemd/a real kernel in a bare container. Tolerate that final gate; we only +# assert the binary was installed. (Daemon start-up is covered by the e2e job.) +echo "→ installing Docker (daemon start-up gate is expected to be skipped here)…" +( install_docker_engine ) || echo " (docker daemon gate skipped — expected in a container)" + +install_kubectl +install_k3d +install_helm + +# ── Assertions ─────────────────────────────────────────────────────────────── +echo "" +echo "── prerequisite check ─────────────────────────────────────────────────" +fail=0 +for tool in docker kubectl k3d helm conntrack; do + if path="$(command -v "$tool" 2>/dev/null)"; then + ver="$("$tool" --version 2>/dev/null | head -1 || true)" + printf ' ✔ %-9s %s %s\n' "$tool" "$path" "${ver:-}" + else + printf ' ✖ %-9s MISSING\n' "$tool" + fail=1 + fi +done +echo "───────────────────────────────────────────────────────────────────────" + +if [[ $fail -ne 0 ]]; then + echo "RESULT: FAIL — a prerequisite did not install on ${PRETTY}" + exit 1 +fi +echo "RESULT: PASS — all prerequisites installed on ${PRETTY}" diff --git a/scripts/tests/setup-linux.bats b/scripts/tests/setup-linux.bats index eac2bc1..d9e22c4 100644 --- a/scripts/tests/setup-linux.bats +++ b/scripts/tests/setup-linux.bats @@ -71,6 +71,22 @@ setup() { run mock_calls [[ "$output" != *"Installing conntrack"* ]] } +# Caught by the cross-distro CI matrix on Amazon Linux 2023: helm's get-helm-3 +# needs openssl (checksum) + tar (unpack), absent on minimal cloud images. +@test "install_system_deps: ensures openssl + tar (helm needs them on minimal images)" { + PRESENT_CMDS="dnf curl conntrack" # openssl + tar absent + run install_system_deps + run mock_calls + [[ "$output" == *"Installing openssl"* ]] + [[ "$output" == *"Installing tar"* ]] +} +@test "install_system_deps: openssl + tar already present -> not reinstalled" { + PRESENT_CMDS="apt-get curl conntrack openssl tar" + run install_system_deps + run mock_calls + [[ "$output" != *"Installing openssl"* ]] + [[ "$output" != *"Installing tar"* ]] +} # ── install_docker_engine: branch selection ──────────────────────────────── @test "install_docker_engine: Amazon Linux -> dnf docker" { From d8c06301236329544ca10f3cade77c1e72a455fe Mon Sep 17 00:00:00 2001 From: Lukas Wuttke Date: Tue, 2 Jun 2026 09:50:59 +0200 Subject: [PATCH 08/13] =?UTF-8?q?ci(installer):=20fix=20static=20gate=20(.?= =?UTF-8?q?bats=20=E2=89=A0=20bash)=20+=20Windows-safe=20Confirm-Config=20?= =?UTF-8?q?test?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The new workflow's first run caught two issues — exactly its job: 1. Static analysis failed: `bash -n` ran on .bats files, which are bats DSL (@test "name" { … }), not valid bash. Restrict the syntax check to *.sh; .bats are validated by being run in the unit-bash job. 2. Pester on real windows-latest failed 1/55: the Confirm-Config test set $env:USERPROFILE = $env:HOME, but $env:HOME is empty on Windows, so [System.IO.Path]::GetFullPath("") threw "path is empty". The INSTALLER is correct (defaults to $env:USERPROFILE, always set on Windows) — the test fixture was Linux-centric. Derive a profile dir valid on both OSes. For the record, the first run's wins: all 9 distro prereq jobs (ubuntu 22.04/ 24.04, debian 12, almalinux 8/9, rockylinux 9, amazonlinux 2023, fedora, opensuse leap) + bats + Linux Pester passed on GHA's amd64 runners. Co-Authored-By: Claude Opus 4.8 --- .github/workflows/installer-tests.yaml | 9 ++++++--- scripts/tests/install-k8s.Tests.ps1 | 12 ++++++++---- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/.github/workflows/installer-tests.yaml b/.github/workflows/installer-tests.yaml index bab7b90..9a20fae 100644 --- a/.github/workflows/installer-tests.yaml +++ b/.github/workflows/installer-tests.yaml @@ -35,11 +35,14 @@ jobs: steps: - uses: actions/checkout@v4 - - name: bash -n (syntax) on every script + - name: bash -n (syntax) on every shell script run: | - find scripts -type f \( -name '*.sh' -o -name '*.bats' \) -print0 \ + # .bats files are bats DSL (@test "name" { … }), not valid bash — they are + # syntax-checked by actually running them in the unit-bash job. Parse the + # real shell scripts here. + find scripts -type f -name '*.sh' -print0 \ | while IFS= read -r -d '' f; do bash -n "$f" || exit 1; done - echo "all scripts parse" + echo "all shell scripts parse" - name: ShellCheck (libs + entrypoints) run: | diff --git a/scripts/tests/install-k8s.Tests.ps1 b/scripts/tests/install-k8s.Tests.ps1 index a092fad..ed3967c 100644 --- a/scripts/tests/install-k8s.Tests.ps1 +++ b/scripts/tests/install-k8s.Tests.ps1 @@ -118,14 +118,18 @@ Describe "Get-WindowsArch" { Describe "Confirm-Config" { It "valid config passes + sets HOST_DATA_DIR" { - $env:USERPROFILE = $env:HOME - $CLUSTER_NAME = "tracebloc"; $SERVERS = "1"; $AGENTS = "1"; $HOST_DATA_DIR = "$env:HOME/.tracebloc" + # $env:HOME is empty on Windows (it uses USERPROFILE) — derive a profile dir + # valid on both OSes, else GetFullPath in Confirm-Config throws "path is empty". + $prof = if ($env:USERPROFILE) { $env:USERPROFILE } elseif ($env:HOME) { $env:HOME } else { [System.IO.Path]::GetTempPath() } + $env:USERPROFILE = $prof + $CLUSTER_NAME = "tracebloc"; $SERVERS = "1"; $AGENTS = "1"; $HOST_DATA_DIR = Join-Path $prof ".tracebloc" { Confirm-Config } | Should -Not -Throw } It "invalid CLUSTER_NAME -> Err" { Mock Err { throw "err" } - $env:USERPROFILE = $env:HOME - $CLUSTER_NAME = "1bad"; $SERVERS = "1"; $AGENTS = "1"; $HOST_DATA_DIR = "$env:HOME/x" + $prof = if ($env:USERPROFILE) { $env:USERPROFILE } elseif ($env:HOME) { $env:HOME } else { [System.IO.Path]::GetTempPath() } + $env:USERPROFILE = $prof + $CLUSTER_NAME = "1bad"; $SERVERS = "1"; $AGENTS = "1"; $HOST_DATA_DIR = Join-Path $prof "x" { Confirm-Config } | Should -Throw } } From 5543fa4592c6d2771535fc8059d69e36238d1087 Mon Sep 17 00:00:00 2001 From: Lukas Wuttke Date: Tue, 2 Jun 2026 09:55:40 +0200 Subject: [PATCH 09/13] ci(installer): gate ShellCheck at error severity (SC2034 cross-file false positives) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The libs are sourced together as one program, so single-file shellcheck reports SC2034 "unused" for shared vars defined in common.sh and consumed in other sourced files (CURL_SECURE, ARCH_DL, colours…). Gate at --severity=error (0 findings); warnings still printed as advisory. With this, Static analysis joins the already-green distro matrix + Windows/Linux Pester + bats. Co-Authored-By: Claude Opus 4.8 --- .github/workflows/installer-tests.yaml | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/.github/workflows/installer-tests.yaml b/.github/workflows/installer-tests.yaml index 9a20fae..82e4b8d 100644 --- a/.github/workflows/installer-tests.yaml +++ b/.github/workflows/installer-tests.yaml @@ -48,9 +48,18 @@ jobs: run: | sudo apt-get update -qq && sudo apt-get install -y -qq shellcheck shellcheck --version | grep version - shellcheck --severity=warning --shell=bash \ + # Gate at error severity. The libs are sourced together as one program, + # so single-file shellcheck reports SC2034 "unused" false positives for + # shared vars (CURL_SECURE, ARCH_DL, colours…) that are defined in + # common.sh and consumed in other sourced files. Warnings are printed + # below for visibility but don't fail the gate. + shellcheck --severity=error --shell=bash \ scripts/install.sh scripts/install-k8s.sh scripts/lib/*.sh \ scripts/tests/distro-prereqs.sh + echo "── shellcheck warnings (advisory, non-blocking) ──" + shellcheck --severity=warning --shell=bash \ + scripts/install.sh scripts/install-k8s.sh scripts/lib/*.sh \ + scripts/tests/distro-prereqs.sh || true - name: PSScriptAnalyzer (PowerShell installer) shell: pwsh From fe57d5eb05efaea99ac888e0b465bdcb01d59db3 Mon Sep 17 00:00:00 2001 From: Lukas Wuttke Date: Tue, 2 Jun 2026 10:06:00 +0200 Subject: [PATCH 10/13] ci(installer): real k3d cluster-up E2E on Ubuntu (amd64 + arm64) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds scripts/tests/e2e-cluster.sh + an e2e-cluster matrix job — the highest- fidelity check CI can run. It drives the installer's OWN create_cluster() to bring up an actual k3d cluster on a real kernel (Docker is preinstalled on the runner), asserts every node reaches Ready, then proves the cluster can pull, schedule, and run a public workload (nginx:alpine), and tears down. It deliberately stops BEFORE the tracebloc helm install / backend registration (private images + real credentials), so it needs no secrets. Runs on ubuntu-22.04, ubuntu-24.04, and ubuntu-24.04-arm (arm64 runners are free on this public repo) — covering the real cluster path on both architectures. Validated locally on an arm64 Ubuntu VM: create_cluster() → server+agent Ready (k3s v1.29.4) → nginx pod Running → teardown. shellcheck clean (0 errors). Co-Authored-By: Claude Opus 4.8 --- .github/workflows/installer-tests.yaml | 22 ++++++++- scripts/tests/e2e-cluster.sh | 65 ++++++++++++++++++++++++++ 2 files changed, 85 insertions(+), 2 deletions(-) create mode 100644 scripts/tests/e2e-cluster.sh diff --git a/.github/workflows/installer-tests.yaml b/.github/workflows/installer-tests.yaml index 82e4b8d..fab0510 100644 --- a/.github/workflows/installer-tests.yaml +++ b/.github/workflows/installer-tests.yaml @@ -55,11 +55,11 @@ jobs: # below for visibility but don't fail the gate. shellcheck --severity=error --shell=bash \ scripts/install.sh scripts/install-k8s.sh scripts/lib/*.sh \ - scripts/tests/distro-prereqs.sh + scripts/tests/distro-prereqs.sh scripts/tests/e2e-cluster.sh echo "── shellcheck warnings (advisory, non-blocking) ──" shellcheck --severity=warning --shell=bash \ scripts/install.sh scripts/install-k8s.sh scripts/lib/*.sh \ - scripts/tests/distro-prereqs.sh || true + scripts/tests/distro-prereqs.sh scripts/tests/e2e-cluster.sh || true - name: PSScriptAnalyzer (PowerShell installer) shell: pwsh @@ -140,3 +140,21 @@ jobs: run: | docker run --rm -v "$PWD:/src:ro" -w /src "$DISTRO" \ bash scripts/tests/distro-prereqs.sh + + e2e-cluster: + # Highest-fidelity check CI can run: brings up an ACTUAL k3d cluster via the + # installer's own create_cluster() on a real kernel (Docker is preinstalled + # on the runner), proves it can schedule + run a public workload, then tears + # down. Stops BEFORE the tracebloc helm install / backend registration (those + # need private images + real credentials), so it needs no secrets. Runs on + # both amd64 and arm64 Ubuntu runners (arm64 is free on this public repo). + name: E2E cluster (${{ matrix.os }}) + strategy: + fail-fast: false + matrix: + os: [ubuntu-22.04, ubuntu-24.04, ubuntu-24.04-arm] + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v4 + - name: Bring up a real k3d cluster + run a workload + run: bash scripts/tests/e2e-cluster.sh diff --git a/scripts/tests/e2e-cluster.sh b/scripts/tests/e2e-cluster.sh new file mode 100644 index 0000000..2bb80e5 --- /dev/null +++ b/scripts/tests/e2e-cluster.sh @@ -0,0 +1,65 @@ +#!/usr/bin/env bash +# ============================================================================= +# e2e-cluster.sh — real end-to-end cluster smoke test +# ----------------------------------------------------------------------------- +# Brings up an ACTUAL k3d cluster on a real kernel using the installer's own +# create_cluster() path (the same function main() calls), proves the cluster can +# schedule and run a public workload, then tears it down. This is the highest- +# fidelity check CI can run: it exercises k3d cluster create, the proxy/NO_PROXY +# config, kubeconfig merge, and the API-readiness wait against a live daemon — +# none of which the mocked unit tests or the prereq-install matrix can. +# +# It deliberately STOPS before the tracebloc helm install / backend +# registration: those pull private images and need real credentials + a +# reachable platform. So this needs no secrets and runs on stock GitHub runners +# (Docker is preinstalled) and locally (Lima/any Docker host). +# +# Usage: bash scripts/tests/e2e-cluster.sh +# ============================================================================= +set -euo pipefail + +HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +LIB="$HERE/../lib" + +# Isolated cluster name so we never touch a real 'tracebloc' cluster; opt out of +# autostart so we don't reconfigure docker.service / restart policies on the host. +export USER="${USER:-$(id -un)}" +export CLUSTER_NAME="${CLUSTER_NAME:-tbe2e}" +export TRACEBLOC_NO_AUTOSTART=1 + +# shellcheck source=/dev/null +source "$LIB/common.sh" +# shellcheck source=/dev/null +source "$LIB/setup-linux.sh" +# shellcheck source=/dev/null +source "$LIB/cluster.sh" + +cleanup() { k3d cluster delete "$CLUSTER_NAME" >/dev/null 2>&1 || true; } +trap cleanup EXIT + +echo "═══════════════════════════════════════════════════════════════════════" +echo " E2E cluster smoke arch: $(uname -m) kernel: $(uname -r)" +echo "═══════════════════════════════════════════════════════════════════════" + +# Docker is preinstalled + running on the runner; we only need the CLI tools the +# cluster step uses. (We do NOT run install_docker_engine — no daemon gymnastics.) +has docker || error "Docker is not available on this host." +umask 022 +install_kubectl +install_k3d +install_helm + +echo "── create_cluster() — the installer's real cluster-bring-up path ──" +create_cluster + +echo "── assert: all nodes reach Ready ──" +kubectl wait --for=condition=Ready nodes --all --timeout=180s +kubectl get nodes -o wide + +echo "── assert: the cluster can pull, schedule, and run a public workload ──" +kubectl run e2e-probe --image=nginx:alpine --restart=Never +kubectl wait --for=condition=Ready pod/e2e-probe --timeout=180s +kubectl get pods -o wide + +echo "" +echo "E2E PASS: k3d cluster came up via the installer's create_cluster() and ran a workload." From 129b1fed0b1ebc33173ee51b3aa1ade3563abbdc Mon Sep 17 00:00:00 2001 From: Lukas Wuttke Date: Tue, 2 Jun 2026 10:09:17 +0200 Subject: [PATCH 11/13] ci(installer): fix E2E probe race on the default ServiceAccount MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The amd64 runners proved the cluster comes up fine (nodes Ready), but the probe pod failed with "serviceaccount default not found" — kubectl run raced the SA controller, which creates default/default asynchronously after the node goes Ready. arm64 dodged it by timing. Wait for the SA before running the pod. Pure test-harness fix; the installer cluster path is correct on all arches. Co-Authored-By: Claude Opus 4.8 --- scripts/tests/e2e-cluster.sh | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/scripts/tests/e2e-cluster.sh b/scripts/tests/e2e-cluster.sh index 2bb80e5..d5b80f2 100644 --- a/scripts/tests/e2e-cluster.sh +++ b/scripts/tests/e2e-cluster.sh @@ -56,6 +56,14 @@ echo "── assert: all nodes reach Ready ──" kubectl wait --for=condition=Ready nodes --all --timeout=180s kubectl get nodes -o wide +echo "── wait for the default ServiceAccount (created async after node Ready) ──" +# kubectl run binds the pod to default/default; on fast runners that can race the +# service-account controller ("serviceaccount default not found"). Wait for it. +for _ in $(seq 1 30); do + kubectl get serviceaccount default -n default >/dev/null 2>&1 && break + sleep 2 +done + echo "── assert: the cluster can pull, schedule, and run a public workload ──" kubectl run e2e-probe --image=nginx:alpine --restart=Never kubectl wait --for=condition=Ready pod/e2e-probe --timeout=180s From c19b3116703e12b8429a85cc01d01c848a352c1c Mon Sep 17 00:00:00 2001 From: Lukas Wuttke Date: Tue, 2 Jun 2026 10:49:04 +0200 Subject: [PATCH 12/13] ci(installer): authenticated corporate-proxy E2E (squid) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds scripts/tests/e2e-proxy.sh + an e2e-proxy job. Stands up a squid that REQUIRES basic auth, brings up a k3d cluster via the installer's create_cluster() with HTTP(S)_PROXY=http://user:pass@host.k3d.internal:3128, and proves the nodes pull a workload image THROUGH the authed proxy — the squid access log shows an authenticated CONNECT to auth.docker.io (which only a real image pull makes, never the readiness probe), closing the "proxy silently bypassed" false positive. It also asserts anonymous requests are refused, so auth is genuinely enforced. Guards the corporate-proxy hardening end-to-end (#172/#174, the Charité/hospital archetype): _write_k3d_proxy_config passes proxy env via a k3d config FILE so the '@' in user:pass@host survives (k3d splits --env on '@'), plus _augment_no_proxy. If the credentials regress, squid 407s and the pull hangs — the test fails loudly. Stops before the helm install / backend registration; no secrets. Validated locally on an arm64 Ubuntu VM: anonymous refused → cluster up via the authed proxy → nginx pulled through it (auth.docker.io + registry-1.docker.io CONNECTs by the proxy user) → teardown. shellcheck clean. Co-Authored-By: Claude Opus 4.8 --- .github/workflows/installer-tests.yaml | 18 +++- scripts/tests/e2e-proxy.sh | 137 +++++++++++++++++++++++++ 2 files changed, 153 insertions(+), 2 deletions(-) create mode 100644 scripts/tests/e2e-proxy.sh diff --git a/.github/workflows/installer-tests.yaml b/.github/workflows/installer-tests.yaml index fab0510..b9f841f 100644 --- a/.github/workflows/installer-tests.yaml +++ b/.github/workflows/installer-tests.yaml @@ -55,11 +55,11 @@ jobs: # below for visibility but don't fail the gate. shellcheck --severity=error --shell=bash \ scripts/install.sh scripts/install-k8s.sh scripts/lib/*.sh \ - scripts/tests/distro-prereqs.sh scripts/tests/e2e-cluster.sh + scripts/tests/distro-prereqs.sh scripts/tests/e2e-cluster.sh scripts/tests/e2e-proxy.sh echo "── shellcheck warnings (advisory, non-blocking) ──" shellcheck --severity=warning --shell=bash \ scripts/install.sh scripts/install-k8s.sh scripts/lib/*.sh \ - scripts/tests/distro-prereqs.sh scripts/tests/e2e-cluster.sh || true + scripts/tests/distro-prereqs.sh scripts/tests/e2e-cluster.sh scripts/tests/e2e-proxy.sh || true - name: PSScriptAnalyzer (PowerShell installer) shell: pwsh @@ -158,3 +158,17 @@ jobs: - uses: actions/checkout@v4 - name: Bring up a real k3d cluster + run a workload run: bash scripts/tests/e2e-cluster.sh + + e2e-proxy: + # Authenticated corporate-proxy E2E (the Charité/hospital archetype): stands + # up a squid that REQUIRES basic auth, brings the cluster up with the + # installer's proxy config pointed at it as user:pass@host, and proves the + # nodes pull a workload image THROUGH the authed proxy (the squid log shows an + # authenticated auth.docker.io CONNECT, which only a real pull makes). Guards + # the #172/#174 proxy hardening end-to-end. Single runner (arch-agnostic). No secrets. + name: E2E auth-proxy (squid) + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Cluster up through an authenticated proxy + run: bash scripts/tests/e2e-proxy.sh diff --git a/scripts/tests/e2e-proxy.sh b/scripts/tests/e2e-proxy.sh new file mode 100644 index 0000000..aaf55b0 --- /dev/null +++ b/scripts/tests/e2e-proxy.sh @@ -0,0 +1,137 @@ +#!/usr/bin/env bash +# ============================================================================= +# e2e-proxy.sh — authenticated corporate-proxy end-to-end test +# ----------------------------------------------------------------------------- +# Stands up a real squid proxy that REQUIRES basic auth, then brings up a k3d +# cluster via the installer's create_cluster() with HTTP(S)_PROXY pointed at it +# as http://user:pass@host — and proves the cluster's nodes pull a workload +# image THROUGH the authenticated proxy. +# +# This exercises the corporate-proxy hardening end-to-end (the Charité/hospital +# archetype): _write_k3d_proxy_config (passes proxy env via a k3d CONFIG FILE so +# the '@' in user:pass@host survives — k3d splits --env on '@') + _augment_no_proxy +# (so in-cluster traffic bypasses the proxy and `--wait` doesn't hang). +# +# If the credentials get mangled, squid answers 407, the image pull hangs, and +# the pod never goes Ready — so this test fails loudly on a proxy-auth regression. +# It stops before the tracebloc helm install / backend registration (no secrets). +# +# Usage: bash scripts/tests/e2e-proxy.sh +# ============================================================================= +set -euo pipefail + +HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +LIB="$HERE/../lib" + +export USER="${USER:-$(id -un)}" +export CLUSTER_NAME="${CLUSTER_NAME:-tbproxy}" +export TRACEBLOC_NO_AUTOSTART=1 + +PROXY_USER="tbuser" +PROXY_PASS="tb-Pass.123" # contains no '@', but the URL form does: user:pass@host +PROXY_PORT="3128" +SQUID_NAME="tb-squid" +WORK="$(mktemp -d)" + +# shellcheck source=/dev/null +source "$LIB/common.sh" +# shellcheck source=/dev/null +source "$LIB/setup-linux.sh" +# shellcheck source=/dev/null +source "$LIB/cluster.sh" + +cleanup() { + k3d cluster delete "$CLUSTER_NAME" >/dev/null 2>&1 || true + docker rm -f "$SQUID_NAME" >/dev/null 2>&1 || true + rm -rf "$WORK" +} +trap cleanup EXIT + +echo "═══════════════════════════════════════════════════════════════════════" +echo " Authenticated-proxy E2E arch: $(uname -m)" +echo "═══════════════════════════════════════════════════════════════════════" + +has docker || error "Docker is not available on this host." + +# Install the CLI tools directly (the proxy below is exercised by the cluster +# NODES, which is where the auth-proxy hardening lives). +umask 022 +install_kubectl +install_k3d +install_helm + +# ── 1. squid that REQUIRES basic auth ─────────────────────────────────────── +echo "── starting an authenticated squid proxy ──" +printf '%s:%s\n' "$PROXY_USER" "$(openssl passwd -apr1 "$PROXY_PASS")" > "$WORK/passwords" +cat > "$WORK/squid.conf" <<'EOF' +auth_param basic program /usr/lib/squid/basic_ncsa_auth /etc/squid/passwords +auth_param basic realm tracebloc-test-proxy +acl authed proxy_auth REQUIRED +acl SSL_ports port 443 +acl CONNECT method CONNECT +http_access deny CONNECT !SSL_ports +http_access allow authed +http_access deny all +http_port 3128 +EOF +docker rm -f "$SQUID_NAME" >/dev/null 2>&1 || true +docker run -d --name "$SQUID_NAME" -p "${PROXY_PORT}:3128" \ + -v "$WORK/squid.conf:/etc/squid/squid.conf:ro" \ + -v "$WORK/passwords:/etc/squid/passwords:ro" \ + ubuntu/squid:latest >/dev/null + +echo "── waiting for squid + verifying auth is enforced ──" +ready="" +for _ in $(seq 1 30); do + # A correctly-authenticated CONNECT to a registry should tunnel (curl exit 0); + # squid returns 407 (curl exit 56/22) if auth is wrong or not yet up. + # No -f: the registry answers 401 (needs a token) even on a healthy tunnel; we + # only care that the proxy TUNNELED the request (curl exit 0) vs refused with + # 407 (curl non-zero). -o /dev/null discards the body. + if curl -sS -m 8 -x "http://${PROXY_USER}:${PROXY_PASS}@127.0.0.1:${PROXY_PORT}" \ + https://registry-1.docker.io/v2/ -o /dev/null 2>/dev/null; then + ready=1; break + fi + sleep 2 +done +[[ -n "$ready" ]] || error "squid did not become ready / auth check failed." +# Prove auth is actually ENFORCED: a request with NO credentials must be refused. +if curl -sS -m 8 -x "http://127.0.0.1:${PROXY_PORT}" https://registry-1.docker.io/v2/ -o /dev/null 2>/dev/null; then + error "Proxy allowed an unauthenticated request — auth not enforced; test is invalid." +fi +success "Authenticated squid proxy up (anonymous requests refused)." + +# ── 2. bring up the cluster with the nodes pointed at the AUTHED proxy ─────── +# Nodes reach the host's published squid via host.k3d.internal (k3d injects it). +# The user:pass@host form is the exact shape the #174 fix protects. +export HTTP_PROXY="http://${PROXY_USER}:${PROXY_PASS}@host.k3d.internal:${PROXY_PORT}" +export HTTPS_PROXY="$HTTP_PROXY" +echo "── create_cluster() with HTTP(S)_PROXY=http://${PROXY_USER}:***@host.k3d.internal:${PROXY_PORT} ──" +create_cluster +kubectl wait --for=condition=Ready nodes --all --timeout=180s + +echo "── wait for the default ServiceAccount (created async after node Ready) ──" +for _ in $(seq 1 30); do + kubectl get serviceaccount default -n default >/dev/null 2>&1 && break + sleep 2 +done + +echo "── pull + run a public workload — the node must fetch it THROUGH the proxy ──" +kubectl run e2e-probe --image=nginx:alpine --restart=Never +kubectl wait --for=condition=Ready pod/e2e-probe --timeout=180s +kubectl get pods -o wide + +# ── 3. prove the node's image pull actually traversed the AUTHED proxy ─────── +echo "── squid access log: the node's authenticated image-pull traffic ──" +plog="$(docker exec "$SQUID_NAME" cat /var/log/squid/access.log 2>/dev/null || true)" +echo "$plog" | grep -E 'CONNECT' | grep "$PROXY_USER" | grep -E 'docker' | tail -8 | sed 's/^/ /' +# auth.docker.io is fetched only by a real image pull (the node getting a pull +# token) — never by the readiness probe to /v2/, which stops at the 401. So an +# authenticated CONNECT to it proves the NODE pulled through the proxy (not just +# the host's readiness check), closing the "proxy silently ignored" false-positive. +if ! echo "$plog" | grep -E 'CONNECT .*auth\.docker\.io' | grep -q "$PROXY_USER"; then + error "No authenticated auth.docker.io CONNECT in the proxy log — the node's image pull did not traverse the proxy." +fi + +echo "" +echo "E2E PASS: cluster came up via an AUTHENTICATED proxy and pulled a workload through it." From d97e20daf68703b885692d9d4f7b123af4f81bfb Mon Sep 17 00:00:00 2001 From: Asad Iqbal Date: Tue, 2 Jun 2026 14:41:48 +0500 Subject: [PATCH 13/13] fix(installer): install kernel-modules-extra + handle reboot-required (#176) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit dockerd crash-loops on minimal RHEL/AlmaLinux images because xt_addrtype/ iptable_nat/br_netfilter live in kernel-modules-extra, not the base kernel-modules package. The prior fix installed kernel-modules-$(uname -r) — the wrong package — so the self-heal never took. Install kernel-modules-extra (unversioned). When the repo's extra modules target a newer kernel than the running one (stale AMI), they can't load until reboot: detect that, set KMODS_REBOOT_REQUIRED, and have install_docker_engine print a clear reboot-and-re-run message instead of a raw Docker error. Modules persist via /etc/modules-load.d/tracebloc.conf. Verified end-to-end on a pristine AlmaLinux 10.1 MINIMAL EC2 box: reboot gate fires, post-reboot modules load, re-run reaches Connected. Co-Authored-By: Claude Opus 4.7 --- scripts/lib/setup-linux.sh | 47 ++++++++++++++++++++++++++++++-------- 1 file changed, 38 insertions(+), 9 deletions(-) diff --git a/scripts/lib/setup-linux.sh b/scripts/lib/setup-linux.sh index bc42398..6af7618 100755 --- a/scripts/lib/setup-linux.sh +++ b/scripts/lib/setup-linux.sh @@ -16,22 +16,41 @@ setup_pm() { # ── Kernel modules Docker + k3s need ───────────────────────────────────────── # Docker's bridge driver programs iptables NAT rules using the `addrtype` match -# (xt_addrtype), and k3s needs br_netfilter + overlay. Minimal RHEL/AlmaLinux -# cloud images (e.g. AWS EC2) ship kernel-modules-core but NOT the full -# kernel-modules package that carries these, so dockerd dies on startup with -# "iptables … addrtype … missing kernel module". Load them — installing the -# matching kernel-modules package on RHEL-family if a load fails — and persist -# for reboots. Best-effort + idempotent; harmless where the modules already exist. +# (xt_addrtype), and k3s needs br_netfilter + overlay. On minimal RHEL/AlmaLinux +# cloud images (e.g. AWS EC2) these netfilter modules ship in kernel-modules-EXTRA, +# which is NOT installed by default (the base kernel-modules package does NOT +# carry xt_addrtype/iptable_nat/br_netfilter) — so dockerd dies on startup with +# "iptables … addrtype … missing kernel module". Install kernel-modules-extra, +# (re)load the modules, and persist them for reboots. Best-effort + idempotent. +# +# Caveat: kernel-modules-extra is only published for the repo's CURRENT kernel. +# If the running kernel is older (image hasn't been rebooted into the latest +# kernel yet), dnf installs the modules for the NEW kernel and they can't be +# modprobe'd until a reboot. We flag that (KMODS_REBOOT_REQUIRED) so the caller +# can tell the user to reboot + re-run; the modules-load.d entry then activates +# them on boot. _ensure_kernel_modules() { local mods="overlay br_netfilter xt_addrtype iptable_nat ip_tables" local m missing="" for m in $mods; do sudo modprobe "$m" 2>/dev/null || missing=1; done if [[ -n "$missing" ]] && has dnf; then + # The netfilter modules live in kernel-modules-extra, NOT the base + # kernel-modules package. Install unversioned so dnf pulls the extra set + # (and a matching newer kernel, if the repo has moved on) for the current repo. spin_cmd "Installing kernel modules for Docker/k3s…" \ - sudo dnf install -y -q "kernel-modules-$(uname -r)" || true - for m in $mods; do sudo modprobe "$m" 2>/dev/null || true; done + sudo dnf install -y -q kernel-modules-extra || true + missing="" + for m in $mods; do sudo modprobe "$m" 2>/dev/null || missing=1; done fi printf '%s\n' $mods | sudo tee /etc/modules-load.d/tracebloc.conf >/dev/null 2>&1 || true + + # Still unloadable, but the module file exists for a DIFFERENT (installed but + # not-yet-booted) kernel → a reboot will bring it in via modules-load.d. + if [[ -n "$missing" ]] \ + && ! find "/lib/modules/$(uname -r)" -name 'xt_addrtype.ko*' 2>/dev/null | grep -q . \ + && find /lib/modules -name 'xt_addrtype.ko*' 2>/dev/null | grep -q .; then + KMODS_REBOOT_REQUIRED=1 + fi } # ── Docker Engine ──────────────────────────────────────────────────────────── @@ -93,9 +112,19 @@ install_docker_engine() { # send the user in circles, as it can't fix a crashing daemon). if ! sudo systemctl is-active --quiet docker 2>/dev/null; then echo "" + # Modules were just installed for a newer, not-yet-booted kernel → the only + # remedy is a reboot; a re-run without it would loop on the same failure. + if [[ -n "${KMODS_REBOOT_REQUIRED:-}" ]]; then + warn "Docker can't start yet: the netfilter kernel modules it needs were just installed" + hint "for a newer kernel that isn't running. Reboot to load it, then re-run this installer:" + hint " sudo reboot" + hint "(The modules are pinned in /etc/modules-load.d/tracebloc.conf and load automatically on boot.)" + echo "" + error "Reboot required to finish Docker setup. Reboot, then re-run this installer." + fi warn "Docker is installed, but its daemon won't start — this is a Docker/host issue, not tracebloc." hint "If the error below mentions 'addrtype' / 'missing kernel module', the host lacks the" - hint "netfilter modules Docker needs — try: sudo dnf install -y kernel-modules-\$(uname -r) && sudo reboot" + hint "netfilter modules Docker needs — try: sudo dnf install -y kernel-modules-extra && sudo reboot" hint "Other causes: SELinux, an overlay storage-driver issue, or low /var/lib/docker disk. Docker's error:" { sudo systemctl status docker.service --no-pager -l 2>&1 | tail -6 sudo journalctl -u docker.service --no-pager 2>/dev/null \