Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 83 additions & 0 deletions scripts/lib/setup-linux.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,19 @@
# setup-linux.sh — Linux prerequisites: package manager, Docker Engine,
# system deps, kubectl, k3d, helm, GPU dispatch
# =============================================================================
#
# ── Progress contract: no silent op longer than a few seconds ────────────────
# Anything that can block for more than ~5s MUST stay visibly alive — either a
# spin_cmd spinner (animates while a backgrounded command runs) or an explicit
# heartbeat (see wait_apt_lock below). A blocked step with no output reads as a
# freeze and gets aborted by users. Known long ops in the install journey:
# • apt/dnf install + index update → spin_cmd (animated)
# • waiting on the dpkg/apt lock → wait_apt_lock (heartbeat, NOT a spinner;
# a spinner over a blocked apt is exactly
# the freeze we are fixing — see #740)
# • Docker / k3d / helm downloads → spin_cmd / download_with_progress
# • container image pulls, CLI pod → handled in cluster.sh / install-cli.sh
# Rule of thumb: if a reader can't tell a step from a hang, it needs a heartbeat.

# ── Package manager detection ────────────────────────────────────────────────
setup_pm() {
Expand All @@ -21,6 +34,72 @@ setup_pm() {
else error "No supported package manager found."; fi
}

# ── apt lock — wait VISIBLY instead of letting a spinner hide a blocked apt ───
# On a fresh cloud VM, unattended-upgrades / apt-daily grab the dpkg frontend
# lock for the first few minutes after boot. apt-get then silently blocks on it.
# When apt-get then runs under spin_cmd (output redirected, spinner animating),
# the install looks frozen for minutes and users abort ("still pulling
# conntrack" — see #740). Probe the lock directly and surface the wait BEFORE
# we hand apt to the spinner.

# True (0) while ANY apt/dpkg lock is held by another process; false otherwise.
# Split out as its own function so it can be stubbed at the boundary in tests
# (the bats suite can't take a real kernel lock). Uses fuser (psmisc, present on
# Debian/Ubuntu base images); if fuser is missing we can't probe → report "free"
# so we never block on an unknowable state, and let apt's own waiting take over.
_apt_lock_held() {
has fuser || return 1
local f
for f in /var/lib/dpkg/lock-frontend /var/lib/apt/lists/lock /var/lib/dpkg/lock; do
# fuser exits 0 only when at least one PID has the file open. stderr carries
# the PID list, so silence it; we only care about the exit status.
if fuser "$f" >/dev/null 2>&1; then return 0; fi
done
return 1
}

# Best-effort: which service is most likely holding it, for the timeout hint.
_apt_lock_holder_hint() {
if pgrep -a unattended-upgr >/dev/null 2>&1; then echo "unattended-upgrades"
elif pgrep -a apt >/dev/null 2>&1; then echo "apt-daily"
else echo "another package manager"; fi
}

# Block until the apt lock clears or TRACEBLOC_APT_LOCK_TIMEOUT seconds elapse,
# emitting a clear message + a periodic heartbeat so it is obviously alive.
# Returns 0 if the lock cleared (or was never held), 1 on timeout. apt-only.
wait_apt_lock() {
has apt-get || return 0 # apt path only; other PMs out of scope (#740)
_apt_lock_held || return 0 # fast path: lock is free, say nothing

local timeout="${TRACEBLOC_APT_LOCK_TIMEOUT:-300}"
local interval=5 waited=0

info "Waiting for the system package lock — unattended-upgrades can hold it for"
hint "a few minutes on a fresh VM. This is normal; the installer is not stuck."

while _apt_lock_held; do
if (( waited >= timeout )); then
local holder; holder="$(_apt_lock_holder_hint)"
echo ""
warn "The system package lock is still held after ${timeout}s (likely ${holder})."
hint "Continuing anyway — apt will queue behind it. If the next step stalls,"
hint "let the background update finish, then re-run this installer. To inspect:"
hint " sudo lsof /var/lib/dpkg/lock-frontend"
hint " systemctl status unattended-upgrades apt-daily.service 2>/dev/null"
return 1
fi
# Heartbeat on the same line so the screen doesn't scroll, with an elapsed
# counter that visibly ticks (proof of life, not a frozen spinner).
printf "\r ${DIM}· still waiting for the package lock… %ds${RESET}" "$waited"
sleep "$interval"
waited=$(( waited + interval ))
done

printf "\r\033[K" # clear the heartbeat line
info "System package lock released — continuing."
return 0
}

# ── Kernel modules Docker + k3s need ─────────────────────────────────────────
# Docker's bridge driver programs iptables NAT rules using the `addrtype` match
# (xt_addrtype), and k3s needs br_netfilter + overlay. On minimal RHEL/AlmaLinux
Expand Down Expand Up @@ -162,6 +241,10 @@ install_system_deps() {
has openssl || MISSING_PKGS+=(openssl)
has tar || MISSING_PKGS+=(tar)
if [[ ${#MISSING_PKGS[@]} -gt 0 ]]; then
# Surface a held dpkg lock BEFORE the spinner hides it (apt-only no-op
# elsewhere). Without this, a fresh-VM unattended-upgrades hold makes the
# update/install below look frozen for minutes → users abort (#740).
wait_apt_lock
spin_cmd "Updating package index…" $PM_UPDATE
for pkg in "${MISSING_PKGS[@]}"; do
spin_cmd "Installing $pkg…" $PM_INSTALL "$pkg" || \
Expand Down
77 changes: 77 additions & 0 deletions scripts/tests/setup-linux.bats
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,83 @@ setup() {
[[ "$output" != *"logging out"* ]] # the misleading group hint is NOT used
}

# ── wait_apt_lock: visible wait on a held dpkg lock (#740) ─────────────────
# A fresh-VM unattended-upgrades hold makes apt block silently under the
# spinner → perceived freeze → users abort. wait_apt_lock surfaces the wait
# with a heartbeat and is bounded (proceed-or-timeout), never an infinite spin.
#
# The bats sandbox can't take a real kernel lock, so we mock at the function
# boundary: _apt_lock_held is the single lock probe, and we make it report
# "held" for the first N calls then "free" (simulating unattended-upgrades
# releasing the lock). `sleep` is stubbed so the loop doesn't actually wait.

# (a) lock clears after a few probes → emits wait message, then proceeds.
@test "wait_apt_lock: held lock emits a visible wait, then proceeds when it clears" {
PRESENT_CMDS="apt-get"
sleep() { :; } # don't actually wait between probes
# locked for the first 2 probes, free afterwards
_LOCK_PROBES=0
_apt_lock_held() { _LOCK_PROBES=$((_LOCK_PROBES + 1)); [ "$_LOCK_PROBES" -le 2 ]; }
run wait_apt_lock
[ "$status" -eq 0 ] # proceeded (lock cleared)
[[ "$output" == *"Waiting for the system package lock"* ]] # (a) wait message
[[ "$output" == *"released"* ]] # (b) noticed it cleared and continued
}

# (b) lock NEVER clears → bounded timeout: warns with guidance, returns 1,
# does NOT loop forever. Tiny timeout keeps the test instant.
@test "wait_apt_lock: never-clearing lock times out cleanly (no infinite spin)" {
PRESENT_CMDS="apt-get"
sleep() { :; }
_apt_lock_held() { return 0; } # held forever
pgrep() { return 1; } # holder-hint probe: generic fallback
TRACEBLOC_APT_LOCK_TIMEOUT=10 # short bound for the test
run wait_apt_lock
[ "$status" -eq 1 ] # timed out (did not hang)
[[ "$output" == *"still held after 10s"* ]]
[[ "$output" == *"re-run this installer"* ]] # actionable guidance
}

# (c) lock free from the start → completely silent fast-path (no noise on the
# common case where nothing holds the lock).
@test "wait_apt_lock: free lock is a silent no-op" {
PRESENT_CMDS="apt-get"
_apt_lock_held() { return 1; } # never held
run wait_apt_lock
[ "$status" -eq 0 ]
[ -z "$output" ] # nothing printed
}

# (d) non-apt package manager → no-op (scope is apt-only, #740). Even if a lock
# probe WOULD report held, dnf/yum/etc. must not wait on the apt lock.
@test "wait_apt_lock: non-apt distro skips the apt lock wait entirely" {
PRESENT_CMDS="dnf" # apt-get absent
_apt_lock_held() { return 0; } # would block IF it were ever probed
run wait_apt_lock
[ "$status" -eq 0 ]
[ -z "$output" ]
}

# install_system_deps must run the lock wait BEFORE the spinner that would
# otherwise hide a blocked apt. Assert the wait fires (apt path, lock held once).
@test "install_system_deps: waits on the apt lock before the install spinner (#740)" {
PRESENT_CMDS="apt-get curl" # apt present, conntrack missing → installs
sleep() { :; }
_LOCK_PROBES=0
_apt_lock_held() { _LOCK_PROBES=$((_LOCK_PROBES + 1)); [ "$_LOCK_PROBES" -le 1 ]; }
run install_system_deps
[ "$status" -eq 0 ]
[[ "$output" == *"Waiting for the system package lock"* ]]
}

# _apt_lock_held: with no fuser available we cannot probe → report "free" so we
# never block on an unknowable state (apt's own waiting then takes over).
@test "_apt_lock_held: no fuser -> reports free (does not block)" {
has() { [ "$1" != fuser ]; } # everything present except fuser
run _apt_lock_held
[ "$status" -ne 0 ] # "free" (non-zero = lock not held)
}

# Asad's root cause: minimal AlmaLinux lacks xt_addrtype -> dockerd bridge init fails.
@test "_ensure_kernel_modules: modprobes modules + installs kernel-modules on a load failure" {
has() { [[ "$1" == "dnf" ]]; }
Expand Down
Loading