tracebloc · saadqbal · Jun 8, 2026 · Jun 6, 2026 · Jun 6, 2026
diff --git a/scripts/install-k8s.ps1 b/scripts/install-k8s.ps1
@@ -769,6 +769,9 @@ function Set-ClusterAutostart {
 function New-K3dCluster {
   Log "Creating k3d cluster: '$CLUSTER_NAME'"
 
+  # Docker is up now (unlike at preflight); re-check the runtime's real memory budget.
+  Test-PreflightRuntimeMem
+
   $clusterExists = $false
   $clusterObj = $null
   try {
@@ -1406,23 +1409,47 @@ function Get-PfFreeGb {
   } catch { return $null }
 }
 
+# Memory/CPU as the container runtime sees it (the Docker Desktop / WSL2 VM budget,
+# which is what the pods actually get — smaller than the host). $null if the daemon
+# is down or the value is junk, so callers fall back to the host (CIM) reader.
+function Get-PfRuntimeMemGb {
+  try {  # the +64MB grace mirrors preflight.sh: a VM a hair under N GiB must not floor to N-1
+    $v = ((docker info --format '{{.MemTotal}}' 2>$null) | Out-String).Trim()   # total bytes, as text
+    if ($v -match '^\d+$' -and [int64]$v -gt 0) { return [math]::Floor(([int64]$v + 64MB) / 1GB) }
+  } catch {}   # docker missing or erroring -> fall through
+  return $null # callers then use the host (CIM) reader
+}
+function Get-PfRuntimeCpu {
+  try {  # docker missing or erroring lands in the empty catch below
+    $raw = (docker info --format '{{.NCPU}}' 2>$null | Out-String).Trim()
+    if (($raw -match '^\d+$') -and ([int]$raw -gt 0)) { return [int]$raw }   # digits only, non-zero
+  } catch {}
+  return $null   # "unknown" - lets Get-PfCpu fall back to the host count
+}
+
+# Total memory in whole GB: prefer the runtime (Docker) view, fall back to the host (CIM); $null if neither reader works.
 function Get-PfMemGb {
+  $r = Get-PfRuntimeMemGb; if ($null -ne $r) { return $r }   # the runtime's own total wins whenever it reports one
   try { return [math]::Floor((Get-CimInstance Win32_ComputerSystem -ErrorAction Stop).TotalPhysicalMemory / 1GB) }
   catch { return $null }
 }
 
 function Get-PfCpu {
+  $r = Get-PfRuntimeCpu; if ($null -ne $r) { return $r }   # runtime (Docker) CPU count first; host CIM, then the env var, otherwise
   try { return [int](Get-CimInstance Win32_ComputerSystem -ErrorAction Stop).NumberOfLogicalProcessors }
   catch { if ($env:NUMBER_OF_PROCESSORS) { return [int]$env:NUMBER_OF_PROCESSORS } else { return $null } }
 }
 
 function Test-Preflight {
   if ($env:TRACEBLOC_SKIP_PREFLIGHT) { Info "Preflight checks skipped (TRACEBLOC_SKIP_PREFLIGHT set)."; return }
 
-  $minDiskGb  = if ($env:PF_MIN_DISK_GB)  { [int]$env:PF_MIN_DISK_GB }  else { 5 }
+  $minDiskGb  = if ($env:PF_MIN_DISK_GB)  { [int]$env:PF_MIN_DISK_GB }  else { 10 }
   $warnDiskGb = if ($env:PF_WARN_DISK_GB) { [int]$env:PF_WARN_DISK_GB } else { 20 }
-  $warnMemGb  = if ($env:PF_WARN_MEM_GB)  { [int]$env:PF_WARN_MEM_GB }  else { 4 }
+  $minMemGb   = if ($env:PF_MIN_MEM_GB)   { [int]$env:PF_MIN_MEM_GB }   else { 5 }
+  $warnMemGb  = if ($env:PF_WARN_MEM_GB)  { [int]$env:PF_WARN_MEM_GB }  else { 8 }
+  $recMemGb   = if ($env:PF_REC_MEM_GB)   { [int]$env:PF_REC_MEM_GB }   else { 16 }
   $minCpu     = if ($env:PF_MIN_CPU)      { [int]$env:PF_MIN_CPU }      else { 2 }
+  $recCpu     = if ($env:PF_REC_CPU)      { [int]$env:PF_REC_CPU }      else { 4 }
   $hardFail   = 0
 
   # Architecture — the tracebloc client images (e.g. mysql-client) are amd64-only.
@@ -1437,12 +1464,22 @@ function Test-Preflight {
 
   $cpu = Get-PfCpu
   if      ($null -eq $cpu)   { Warn "CPU: couldn't determine core count (skipping)." }
-  elseif  ($cpu -lt $minCpu) { Warn "CPU: $cpu core(s) - recommended >= $minCpu." }
+  elseif  ($cpu -lt $minCpu) { Warn "CPU: $cpu core(s) - below the $minCpu-core minimum; mysql may hit lock-wait timeouts. $recCpu+ recommended to train." }
+  elseif  ($cpu -lt $recCpu) { Warn "CPU: $cpu cores - fine to run; $recCpu+ recommended to train locally." }
   else                       { Ok "CPU: $cpu cores" }
 
+  # Memory is warn-only on Windows: at preflight the Docker Desktop / WSL2 daemon may
+  # be down (so this is host RAM); the post-Docker re-check sees the real VM budget.
   $mem = Get-PfMemGb
   if      ($null -eq $mem)      { Warn "Memory: couldn't determine total RAM (skipping)." }
-  elseif  ($mem -lt $warnMemGb) { Warn "Memory: $mem GB total - recommended >= $warnMemGb GB; k3s + training may run out of memory." }
+  elseif  ($mem -lt $minMemGb)  {
+    Warn "Memory: $mem GB - below the $minMemGb GB the client needs; it will OOM."
+    Hint "Docker Desktop -> Settings -> Resources -> Memory: raise to >= $warnMemGb GB ($recMemGb GB to train), then re-run."
+  }
+  elseif  ($mem -lt $warnMemGb) {
+    Warn "Memory: $mem GB - enough to run, but training (~8 GB/job) may OOM; $recMemGb GB recommended to train locally."
+    Hint "Docker Desktop -> Settings -> Resources -> Memory >= $recMemGb GB to train."
+  }
   else                          { Ok "Memory: $mem GB" }
 
   $disk = Get-PfFreeGb
@@ -1479,6 +1516,22 @@ function Test-Preflight {
   }
 }
 
+# Re-evaluate memory once Docker is confirmed up. Test-Preflight runs before Docker
+# Desktop starts, so its read may have been host RAM, not the (smaller) Docker VM
+# budget. Called from New-K3dCluster. WARN-only — the user has already waited for
+# Docker, so aborting here would be jarring.
+function Test-PreflightRuntimeMem {
+  # Warn-only second look at memory, reading the runtime (Docker) value only - never the host.
+  if ($env:TRACEBLOC_SKIP_PREFLIGHT) { return }
+  $mem = Get-PfRuntimeMemGb
+  if ($null -eq $mem) { return }   # runtime not reporting - nothing to add
+  $warnMemGb = 8;  if ($env:PF_WARN_MEM_GB) { $warnMemGb = [int]$env:PF_WARN_MEM_GB }
+  $recMemGb  = 16; if ($env:PF_REC_MEM_GB)  { $recMemGb  = [int]$env:PF_REC_MEM_GB }
+  if ($mem -ge $warnMemGb) { return }   # at or above the warn threshold: stay silent
+  Warn "Docker is running with $mem GB - recommended >= $warnMemGb GB ($recMemGb GB to train); the client may OOM under load."
+  Hint "Docker Desktop -> Settings -> Resources -> Memory >= $warnMemGb GB, then re-install."
+}
+
 # =============================================================================
 #  DIAGNOSE — `-Diagnose` support bundle (mirrors scripts/lib/diagnose.sh)
 # =============================================================================

diff --git a/scripts/lib/cluster.sh b/scripts/lib/cluster.sh
@@ -119,6 +119,11 @@ create_cluster() {
 
   _ensure_tracebloc_dirs
 
+  # Docker is up now (unlike at preflight time), so re-check the runtime's real
+  # memory budget — a too-small Docker VM (Mac/Win) surfaces before we build out.
+  # Guarded: cluster.sh can be sourced without preflight.sh (e.g. the e2e harness).
+  if declare -F _pf_recheck_runtime_mem >/dev/null 2>&1; then _pf_recheck_runtime_mem || true; fi
+
   if _cluster_exists; then
     _handle_existing_cluster
   else

diff --git a/scripts/lib/preflight.sh b/scripts/lib/preflight.sh
@@ -10,15 +10,23 @@
 #  Escape hatches:
 #    TRACEBLOC_SKIP_PREFLIGHT=1   skip all checks
 #    TRACEBLOC_ALLOW_ARM64=1      proceed on arm64 despite amd64-only images
+#    PF_MIN_MEM_GB / PF_MIN_DISK_GB   lower the hard floors (CI / odd sites); PF_MIN_CPU lowers a warn-only threshold
 #
 #  This file is side-effect-safe to source (defaults + function defs only).
 # =============================================================================
 
-# Thresholds (overridable via env — for unusual sites or tests)
-PF_MIN_DISK_GB="${PF_MIN_DISK_GB:-5}"      # hard-fail below this (Linux)
+# Thresholds (overridable via env — for unusual sites or tests).
+# RAM floors are derived from the real stack, not guessed: the always-on control plane
+# requests ~2.1 GiB, + k3s/k3d ~0.8 + OS/Docker ~0.7 ≈ ~4.4 GiB just to stay Online on a single-node
+# (k3d) install — so below 5 GiB it boots then OOMs. NOTE(review): the three parts listed sum to ~3.6 GiB,
+# not ~4.4 — confirm the breakdown. 8 GiB is comfortable to run; 16 GiB is needed to train locally (a job's limit is ~8 GiB+).
+PF_MIN_DISK_GB="${PF_MIN_DISK_GB:-10}"     # hard-fail below this (Linux) — base images alone need >5
 PF_WARN_DISK_GB="${PF_WARN_DISK_GB:-20}"   # warn below this
-PF_WARN_MEM_GB="${PF_WARN_MEM_GB:-4}"      # warn below this
+PF_MIN_MEM_GB="${PF_MIN_MEM_GB:-5}"        # hard-fail below this (Linux; warn on Mac/Win)
+PF_WARN_MEM_GB="${PF_WARN_MEM_GB:-8}"      # warn below this (comfortable to run)
+PF_REC_MEM_GB="${PF_REC_MEM_GB:-16}"       # recommended to train locally (copy only, not a gate)
 PF_MIN_CPU="${PF_MIN_CPU:-2}"              # warn below this
+PF_REC_CPU="${PF_REC_CPU:-4}"              # recommended (warn) below this
 
 # Non-exiting failure line (common.sh's error() exits; preflight must finish all
 # checks first, so failures print here and are recorded in PF_HARD_FAIL). Writes
@@ -50,8 +58,26 @@ _pf_probe_url() {
 # Free space in KB on the filesystem holding $1.
 _pf_free_kb() { df -Pk "$1" 2>/dev/null | awk 'NR==2 {print $4}'; }
 
-# Total physical RAM in KB.
-_pf_total_mem_kb() {
+# Memory/CPU as the CONTAINER RUNTIME sees it (the budget the pods actually get).
+# On Docker Desktop / Colima / WSL2 this is the VM's allocation — smaller than the
+# host and the number that matters (a 36 GB Mac can cap its Docker VM at 4 GB). Echo
+# a single integer, or nothing if the daemon is down / the value is junk — callers
+# then fall back to the host reader. (docker info precedent: _pf_docker_root below.)
+_pf_runtime_mem_kb() {
+  has docker || return 0   # no docker CLI: echo nothing so callers fall back to the host
+  local b; b="$(docker info --format '{{.MemTotal}}' 2>/dev/null)" || b=""   # one call; a failed call -> empty
+  [[ "$b" =~ ^[0-9]+$ && "$b" -gt 0 ]] && echo $(( b / 1024 ))             # bytes -> KB; junk or 0 -> nothing
+  return 0
+}
+_pf_runtime_ncpu() {
+  has docker || return 0   # no docker CLI: echo nothing so callers fall back to the host
+  local n; n="$(docker info --format '{{.NCPU}}' 2>/dev/null)" || n=""   # one call; a failed call -> empty
+  [[ "$n" =~ ^[0-9]+$ && "$n" -gt 0 ]] && echo "$n"                      # junk or 0 -> nothing
+  return 0
+}
+
+# Total physical RAM of the HOST in KB.
+_pf_host_mem_kb() {
   if [[ "$OS" == "Darwin" ]]; then
     local b; b=$(sysctl -n hw.memsize 2>/dev/null) || b=""
     [[ -n "$b" ]] && echo $(( b / 1024 ))
@@ -60,15 +86,23 @@ _pf_total_mem_kb() {
   fi
 }
 
-# Logical CPU count.
-_pf_ncpu() {
+# Logical CPU count of the HOST.
+_pf_host_ncpu() {   # Darwin: sysctl hw.ncpu; otherwise nproc, with getconf as the fallback
   if [[ "$OS" == "Darwin" ]]; then
     sysctl -n hw.ncpu 2>/dev/null
   else
     nproc 2>/dev/null || getconf _NPROCESSORS_ONLN 2>/dev/null
   fi
 }
 
+# Available (free) RAM right now, KB, from /proc/meminfo's MemAvailable line — Linux only (for the busy-shared-VM warn).
+_pf_avail_mem_kb() { awk '/^MemAvailable:/ {print $2}' /proc/meminfo 2>/dev/null; }   # prints nothing if the file or the line is missing
+
+# Selectors: prefer the runtime view, fall back to the host. The checks (and the bats numeric
+# test) call these names; they emit one integer, or nothing when neither reader yields a value.
+_pf_total_mem_kb() { local v; v="$(_pf_runtime_mem_kb)"; [[ -n "$v" ]] && { echo "$v"; return 0; }; _pf_host_mem_kb; }
+_pf_ncpu()         { local v; v="$(_pf_runtime_ncpu)";   [[ -n "$v" ]] && { echo "$v"; return 0; }; _pf_host_ncpu; }
+
 # Docker data root if the daemon is up; else where it will live / a host proxy.
 _pf_docker_root() {
   if has docker && docker info >/dev/null 2>&1; then
@@ -124,22 +158,75 @@ _pf_arch() {
 _pf_cpu() {
   local n; n="$(_pf_ncpu)"
   if [[ -z "$n" ]]; then warn "CPU: couldn't determine core count (skipping)."; return 0; fi
+  # CPU is warn-only: starvation throttles (and can trip mysql InnoDB lock-wait
+  # timeouts) but doesn't OOM-kill, and the chart deliberately omits limits.cpu.
   if [[ "$n" -lt "$PF_MIN_CPU" ]]; then
-    warn "CPU: ${n} core(s) — recommended ≥ ${PF_MIN_CPU}."
+    warn "CPU: ${n} core(s) — below the ${PF_MIN_CPU}-core minimum; mysql may hit lock-wait timeouts. ${PF_REC_CPU}+ recommended to train."
+  elif [[ "$n" -lt "$PF_REC_CPU" ]]; then   # meets the minimum but is under the recommended count
+    warn "CPU: ${n} cores — fine to run; ${PF_REC_CPU}+ recommended to train locally."
   else
     success "CPU: ${n} cores"
   fi
   return 0
 }
 
 _pf_memory() {
-  local kb gb; kb="$(_pf_total_mem_kb)"
+  local kb gb mib floor_mib warn_mib src
+  kb="$(_pf_total_mem_kb)"
   if [[ -z "$kb" ]]; then warn "Memory: couldn't determine total RAM (skipping)."; return 0; fi
   gb=$(( kb / 1024 / 1024 ))
-  if [[ "$gb" -lt "$PF_WARN_MEM_GB" ]]; then
-    warn "Memory: ${gb} GB total — recommended ≥ ${PF_WARN_MEM_GB} GB; k3s + training may run out of memory."
+  mib=$(( kb / 1024 ))
+  # Compare in MiB with a 64 MiB grace so a VM that reports e.g. 4 GiB a hair under
+  # 4*1024^3 (Colima / Docker Desktop) doesn't floor to 3 GB and false-trip the gate.
+  floor_mib=$(( PF_MIN_MEM_GB * 1024 - 64 ))
+  warn_mib=$(( PF_WARN_MEM_GB * 1024 ))
+  src="host"; [[ -n "$(_pf_runtime_mem_kb)" ]] && src="Docker VM"   # label which reader supplied the number
+
+  if [[ "$mib" -lt "$floor_mib" ]]; then
+    if [[ "$OS" == "Linux" ]]; then
+      _pf_fail_line "Memory: only ${gb} GB (${src}) — need ≥ ${PF_MIN_MEM_GB} GB to run the tracebloc client."
+      PF_HARD_FAIL=$(( ${PF_HARD_FAIL:-0} + 1 ))
+      hint "Resize the VM (or free memory) to ≥ ${PF_WARN_MEM_GB} GB; ${PF_REC_MEM_GB} GB to train locally. Then re-run."
+    else
+      # Mac/Win: at preflight Docker is usually still down, so this is host RAM —
+      # warn (don't block); the create_cluster re-check sees the real VM size.
+      warn "Memory: ${gb} GB (${src}) — below the ${PF_MIN_MEM_GB} GB the client needs; it will OOM."
+      hint "Docker Desktop → Settings → Resources → Memory: raise to ≥ ${PF_WARN_MEM_GB} GB (${PF_REC_MEM_GB} GB to train), then re-run."
+    fi
+  elif [[ "$mib" -lt "$warn_mib" ]]; then
+    warn "Memory: ${gb} GB (${src}) — enough to run, but training (≈8 GB/job) may OOM; ${PF_REC_MEM_GB} GB recommended to train locally."
+    [[ "$OS" != "Linux" ]] && hint "Docker Desktop → Settings → Resources → Memory ≥ ${PF_REC_MEM_GB} GB to train."
   else
-    success "Memory: ${gb} GB"
+    success "Memory: ${gb} GB (${src})"
+  fi
+
+  # Linux: even when total is fine, a busy shared VM may have little free RAM now. Compared in MiB with the floor's grace.
+  if [[ "$OS" == "Linux" ]]; then
+    local avail_kb avail_gb
+    avail_kb="$(_pf_avail_mem_kb)"
+    if [[ -n "$avail_kb" ]]; then
+      avail_gb=$(( avail_kb / 1024 / 1024 ))   # whole GB, for the message only
+      if [[ $(( avail_kb / 1024 )) -lt "$floor_mib" ]]; then   # not floored GB: ~N GiB free must not read as N-1
+        warn "Memory: only ${avail_gb} GB available right now (other workloads are using this machine) — the client needs ~${PF_MIN_MEM_GB} GB free to start."
+      fi
+    fi
+  fi
+  return 0
+}
+
+# Re-evaluate memory once Docker is confirmed up. Preflight runs before Docker
+# starts (install-k8s.sh), so on macOS/Windows the first read was host RAM, not the
+# Docker VM's smaller budget. Called from create_cluster (cluster.sh) — the first
+# point `docker info` is reliably up on every OS. WARN-only: the user has already
+# waited for Docker to come up, so aborting here would be jarring.
+_pf_recheck_runtime_mem() {
+  if [[ -n "${TRACEBLOC_SKIP_PREFLIGHT:-}" ]]; then return 0; fi   # skip-all escape hatch applies here too
+  local kb mib gb; kb="$(_pf_runtime_mem_kb)"
+  if [[ -z "$kb" ]]; then return 0; fi   # daemon still not reporting — nothing to add
+  mib=$(( kb / 1024 )); gb=$(( mib / 1024 ))
+  if (( mib < PF_WARN_MEM_GB * 1024 )); then
+    warn "Docker is running with ${gb} GB — recommended ≥ ${PF_WARN_MEM_GB} GB (${PF_REC_MEM_GB} GB to train); the client may OOM under load."
+    if [[ "$OS" != "Linux" ]]; then hint "Docker Desktop → Settings → Resources → Memory ≥ ${PF_WARN_MEM_GB} GB, then re-install."; fi
+  fi
+  return 0
+}

diff --git a/scripts/lib/setup-macos.sh b/scripts/lib/setup-macos.sh
@@ -69,7 +69,9 @@ _install_docker_colima() {
     return
   fi
 
-  spin_cmd "Starting Docker runtime…" colima start --cpu 2 --memory 4 --disk 60
+  # Colima VM sizing must clear the preflight floor — the client needs ~5 GB just to run (control plane + k3s + OS),
+  # 16 GB to train locally. Overridable per box. NOTE: the 6 GB default sits below the 8 GB warn threshold, so the post-Docker re-check warns.
+  spin_cmd "Starting Docker runtime…" colima start --cpu "${COLIMA_CPU:-4}" --memory "${COLIMA_MEMORY:-6}" --disk "${COLIMA_DISK:-60}"
 
   if ! docker info &>/dev/null 2>&1; then
     error "Docker did not start. Try running 'colima status' to investigate."

diff --git a/scripts/tests/e2e-cluster.sh b/scripts/tests/e2e-cluster.sh
@@ -33,6 +33,8 @@ source "$LIB/common.sh"
 source "$LIB/setup-linux.sh"
 # shellcheck source=/dev/null
 source "$LIB/cluster.sh"
+# shellcheck source=/dev/null
+source "$LIB/preflight.sh"   # provides _pf_recheck_runtime_mem (called by create_cluster)
 
 cleanup() { k3d cluster delete "$CLUSTER_NAME" >/dev/null 2>&1 || true; }
 trap cleanup EXIT

diff --git a/scripts/tests/install-k8s.Tests.ps1 b/scripts/tests/install-k8s.Tests.ps1
@@ -453,6 +453,45 @@ Describe "Test-Preflight" {
     Mock Test-PfUrl { "ok" }
     { Test-Preflight } | Should -Not -Throw
   }
+  It "memory below floor -> warn-only on Windows (does not throw)" {
+    Mock Test-PfUrl { "ok" }; Mock Get-PfMemGb { 3 }   # 3 GB is under the default 5 GB floor
+    { Test-Preflight } | Should -Not -Throw
+  }
+  It "PF_MIN_MEM_GB override relaxes the floor" {
+    Mock Test-PfUrl { "ok" }; Mock Get-PfMemGb { 3 }; $prev = $env:PF_MIN_MEM_GB; $env:PF_MIN_MEM_GB = "2"
+    try     { { Test-Preflight } | Should -Not -Throw }
+    finally { $env:PF_MIN_MEM_GB = $prev }   # restore even when the assertion fails, so the override never leaks into later tests
+  }
+}
+
+Describe "Get-Pf* runtime (Docker VM) view preference" {
+  It "Get-PfMemGb prefers docker MemTotal over the host" {
+    Mock docker { '8589934592' }          # 8 GiB, in bytes
+    Get-PfMemGb | Should -Be 8
+  }
+  It "Get-PfCpu prefers docker NCPU over the host" {
+    Mock docker { '2' }                   # runtime reports two CPUs
+    Get-PfCpu | Should -Be 2
+  }
+  It "Get-PfRuntimeMemGb: junk value -> null (forces host fallback)" {
+    Mock docker { 'lots' }                # non-numeric output fails the digits-only check
+    Get-PfRuntimeMemGb | Should -BeNullOrEmpty
+  }
+  It "Get-PfRuntimeMemGb: docker errors -> null" {
+    Mock docker { throw "daemon down" }   # the throw is swallowed by the reader's try/catch
+    Get-PfRuntimeMemGb | Should -BeNullOrEmpty
+  }
+}
+
+Describe "Test-PreflightRuntimeMem (post-Docker, warn-only)" {
+  It "small Docker VM -> warns, does not throw" {
+    Mock Get-PfRuntimeMemGb { 4 }       # under the default 8 GB warn threshold
+    { Test-PreflightRuntimeMem } | Should -Not -Throw
+  }
+  It "daemon not reporting (null) -> no-op, does not throw" {
+    Mock Get-PfRuntimeMemGb { $null }   # the function returns before reading any threshold
+    { Test-PreflightRuntimeMem } | Should -Not -Throw
+  }
 }
 
 # --- reboot persistence (Set-ClusterAutostart) -------------------------------