Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 3 additions & 27 deletions .github/workflows/helm-ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,12 @@ on:
paths:
- 'client/**'
- 'ingestor/**'
- 'scripts/**'
- '.github/workflows/helm-ci.yaml'
pull_request:
branches: [main, develop, openshift]
paths:
- 'client/**'
- 'ingestor/**'
- 'scripts/**'
- '.github/workflows/helm-ci.yaml'

jobs:
Expand Down Expand Up @@ -118,28 +116,6 @@ jobs:
> /dev/null
echo "Schema validation passed for ${{ matrix.platform }}"

installer-tests:
name: Installer script tests
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4

- name: Install bats
run: sudo apt-get update -qq && sudo apt-get install -y -qq bats

- name: bats unit tests (bash installer)
run: bats scripts/tests/*.bats

- name: Pester unit tests (PowerShell installer)
shell: pwsh
env:
TB_PESTER: "1"
run: |
Set-PSRepository PSGallery -InstallationPolicy Trusted
Install-Module Pester -MinimumVersion 5.5.0 -Force -SkipPublisherCheck -Scope CurrentUser
Import-Module Pester -MinimumVersion 5.5.0 -Force
$cfg = New-PesterConfiguration
$cfg.Run.Path = "scripts/tests/install-k8s.Tests.ps1"
$cfg.Run.Exit = $true
$cfg.Output.Verbosity = "Detailed"
Invoke-Pester -Configuration $cfg
# Installer script tests (bats + Pester) + the cross-distro prerequisite matrix
# live in their own workflow: .github/workflows/installer-tests.yaml
# (triggered on scripts/** changes).
174 changes: 174 additions & 0 deletions .github/workflows/installer-tests.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
name: Installer tests

# Validates the curl/PowerShell installer (scripts/) across the breadth of
# environments a customer might actually have:
#   • static         — shellcheck + bash -n + PSScriptAnalyzer
#   • unit-bash      — bats (mocked) for the bash installer
#   • unit-pester    — Pester for the PowerShell installer, on Linux AND real Windows
#   • distro-prereqs — runs the REAL Linux prerequisite-install path (package
#                      manager, system deps, Docker branch, kernel modules, kubectl/
#                      k3d/helm) inside a fresh container for each major distro family.
#                      This is what catches "works on Ubuntu, breaks on minimal RHEL"
#                      bugs that mocked unit tests can't see.
#   • e2e-cluster    — brings up an actual k3d cluster via the installer and runs a
#                      public workload on it
#   • e2e-proxy      — same cluster bring-up, but through an authenticated squid proxy
on:
  push:
    branches: [main, develop, openshift]
    paths:
      - 'scripts/**'
      - '.github/workflows/installer-tests.yaml'
  pull_request:
    branches: [main, develop, openshift]
    paths:
      - 'scripts/**'
      - '.github/workflows/installer-tests.yaml'
  schedule:
    - cron: '0 3 * * 1'  # Mondays 03:00 UTC — catch drift as distro base images move
  workflow_dispatch:

permissions:
  contents: read

# One run per ref. A newer push to a pull request cancels the superseded run;
# push / schedule / manual runs are queued rather than cancelled so every
# commit on a protected branch still gets a full result.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: ${{ github.event_name == 'pull_request' }}

# Every job carries an explicit timeout-minutes. Without one, a hung
# `docker run` or cluster bring-up holds a runner for the 360-minute default.
# The values are generous ceilings, not measured budgets — tune as needed.
jobs:
  static:
    name: Static analysis
    runs-on: ubuntu-latest
    timeout-minutes: 10
    steps:
      - uses: actions/checkout@v4

      - name: bash -n (syntax) on every shell script
        run: |
          # The default shell for an unspecified `shell:` is `bash -e {0}` (no
          # pipefail), so a failing `find` on the left of the pipe would go
          # unnoticed and the gate would pass on a partial file list.
          set -o pipefail
          # .bats files are bats DSL (@test "name" { … }), not valid bash — they are
          # syntax-checked by actually running them in the unit-bash job. Parse the
          # real shell scripts here.
          find scripts -type f -name '*.sh' -print0 \
            | while IFS= read -r -d '' f; do bash -n "$f" || exit 1; done
          echo "all shell scripts parse"

      - name: ShellCheck (libs + entrypoints)
        run: |
          sudo apt-get update -qq && sudo apt-get install -y -qq shellcheck
          shellcheck --version | grep version
          # Gate at error severity. The libs are sourced together as one program,
          # so single-file shellcheck reports SC2034 "unused" false positives for
          # shared vars (CURL_SECURE, ARCH_DL, colours…) that are defined in
          # common.sh and consumed in other sourced files. Warnings are printed
          # below for visibility but don't fail the gate.
          shellcheck --severity=error --shell=bash \
            scripts/install.sh scripts/install-k8s.sh scripts/lib/*.sh \
            scripts/tests/distro-prereqs.sh scripts/tests/e2e-cluster.sh scripts/tests/e2e-proxy.sh
          echo "── shellcheck warnings (advisory, non-blocking) ──"
          shellcheck --severity=warning --shell=bash \
            scripts/install.sh scripts/install-k8s.sh scripts/lib/*.sh \
            scripts/tests/distro-prereqs.sh scripts/tests/e2e-cluster.sh scripts/tests/e2e-proxy.sh || true

      - name: PSScriptAnalyzer (PowerShell installer)
        shell: pwsh
        run: |
          Set-PSRepository PSGallery -InstallationPolicy Trusted
          Install-Module PSScriptAnalyzer -Force -SkipPublisherCheck -Scope CurrentUser
          $issues = Invoke-ScriptAnalyzer -Path scripts/install-k8s.ps1 -Severity Error,Warning
          if ($issues) { $issues | Format-Table -AutoSize }
          $errs = @($issues | Where-Object { $_.Severity -eq 'Error' })
          if ($errs.Count -gt 0) { Write-Error "PSScriptAnalyzer: $($errs.Count) error(s)"; exit 1 }
          Write-Host "no PSScriptAnalyzer errors"

  unit-bash:
    name: bats (bash unit, mocked)
    runs-on: ubuntu-latest
    timeout-minutes: 10
    steps:
      - uses: actions/checkout@v4
      - name: Install bats
        run: sudo apt-get update -qq && sudo apt-get install -y -qq bats
      - name: Run bats
        run: bats scripts/tests/*.bats

  unit-pester:
    # Pester on Linux pwsh (fast) AND real Windows — the .ps1 installer's actual
    # target. fail-fast:false so a Windows-only surprise doesn't mask Linux signal.
    name: Pester (${{ matrix.os }})
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-latest, windows-latest]
    runs-on: ${{ matrix.os }}
    timeout-minutes: 15
    steps:
      - uses: actions/checkout@v4
      - name: Run Pester
        shell: pwsh
        env:
          # Tells install-k8s.ps1 it is being dot-sourced by the test suite.
          TB_PESTER: "1"
        run: |
          Set-PSRepository PSGallery -InstallationPolicy Trusted
          Install-Module Pester -MinimumVersion 5.5.0 -Force -SkipPublisherCheck -Scope CurrentUser
          Import-Module Pester -MinimumVersion 5.5.0 -Force
          $cfg = New-PesterConfiguration
          $cfg.Run.Path = "scripts/tests/install-k8s.Tests.ps1"
          $cfg.Run.Exit = $true
          $cfg.Output.Verbosity = "Detailed"
          Invoke-Pester -Configuration $cfg

  distro-prereqs:
    # Runs the installer's REAL Linux prerequisite path in a fresh container for
    # each distro family. Proves the package-manager / Docker / conntrack / helm
    # branches all resolve and install — the layer where every installer bug we
    # have shipped lived (#718 PATH, #719 RHEL docker-ce, #720 conntrack-tools,
    # the AlmaLinux kernel-modules gap, the Amazon Linux openssl/tar gap).
    # It does NOT start dockerd or create a cluster — that needs a real kernel +
    # systemd, covered by local VMs and the e2e-cluster job below. Arch is
    # omitted: its official image is x86-only and bare containers need keyring
    # bootstrapping; the pacman branch is covered by the bats unit test.
    name: Prereqs — ${{ matrix.distro }}
    runs-on: ubuntu-latest
    timeout-minutes: 30
    strategy:
      fail-fast: false
      matrix:
        distro:
          - 'ubuntu:22.04'        # apt / get.docker.com — most common server
          - 'ubuntu:24.04'        # newest LTS
          - 'debian:12'           # apt
          - 'almalinux:9'         # dnf + docker-ce repo (RHEL rebuild, #719)
          - 'almalinux:8'         # older RHEL rebuild
          - 'rockylinux:9'        # the other RHEL rebuild
          - 'amazonlinux:2023'    # dnf + 'docker' pkg — common AWS default
          - 'fedora:latest'       # dnf, falls through to get.docker.com
          - 'opensuse/leap:15.6'  # zypper
    steps:
      - uses: actions/checkout@v4
      - name: Install prerequisites in ${{ matrix.distro }}
        env:
          # Passed through env rather than interpolated into the script text.
          DISTRO: ${{ matrix.distro }}
        run: |
          docker run --rm -v "$PWD:/src:ro" -w /src "$DISTRO" \
            bash scripts/tests/distro-prereqs.sh

  e2e-cluster:
    # Highest-fidelity check CI can run: brings up an ACTUAL k3d cluster via the
    # installer's own create_cluster() on a real kernel (Docker is preinstalled
    # on the runner), proves it can schedule + run a public workload, then tears
    # down. Stops BEFORE the tracebloc helm install / backend registration (those
    # need private images + real credentials), so it needs no secrets. Runs on
    # both amd64 and arm64 Ubuntu runners (arm64 is free on this public repo).
    name: E2E cluster (${{ matrix.os }})
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-22.04, ubuntu-24.04, ubuntu-24.04-arm]
    runs-on: ${{ matrix.os }}
    timeout-minutes: 30
    steps:
      - uses: actions/checkout@v4
      - name: Bring up a real k3d cluster + run a workload
        run: bash scripts/tests/e2e-cluster.sh

  e2e-proxy:
    # Authenticated corporate-proxy E2E (the Charité/hospital archetype): stands
    # up a squid that REQUIRES basic auth, brings the cluster up with the
    # installer's proxy config pointed at it as user:pass@host, and proves the
    # nodes pull a workload image THROUGH the authed proxy (the squid log shows an
    # authenticated auth.docker.io CONNECT, which only a real pull makes). Guards
    # the #172/#174 proxy hardening end-to-end. Single runner (arch-agnostic). No secrets.
    name: E2E auth-proxy (squid)
    runs-on: ubuntu-latest
    timeout-minutes: 30
    steps:
      - uses: actions/checkout@v4
      - name: Cluster up through an authenticated proxy
        run: bash scripts/tests/e2e-proxy.sh
91 changes: 89 additions & 2 deletions scripts/install-k8s.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
# =============================================================================

#Requires -Version 5.1
param([switch]$Help, [switch]$NoReboot)
param([switch]$Help, [switch]$NoReboot, [switch]$Diagnose)

# -- Admin check --------------------------------------------------------------
# $env:TB_PESTER lets the test suite dot-source this file to load the functions
Expand Down Expand Up @@ -965,7 +965,9 @@ function Get-TraceblocYamlValue {
# Resolve the backend base URL the same way jobs-manager does
# (client-runtime/controller.py: CLIENT_ENV -> backend), defaulting to prod.
function Get-BackendUrl {
switch ($env:CLIENT_ENV) {
# Quote the value so a truly-unset CLIENT_ENV ($null) coerces to "" and the
# default (prod) branch reliably fires across PowerShell versions.
switch ("$env:CLIENT_ENV") {
"dev" { return "https://dev-api.tracebloc.io/" }
"stg" { return "https://stg-api.tracebloc.io/" }
default { return "https://api.tracebloc.io/" }
Expand Down Expand Up @@ -1428,13 +1430,98 @@ function Test-Preflight {
}
}

# =============================================================================
# DIAGNOSE — `-Diagnose` support bundle (mirrors scripts/lib/diagnose.sh)
# =============================================================================

# Redact secrets from a file IN PLACE. Applied to every collected file before
# archiving. Single-quoted replacement strings keep $1 literal for the regex.
# Written UTF-8 without BOM.
function Edit-Redaction([string]$Path) {
    # Purpose: scrub credentials from one collected file, rewriting it in place
    #          as UTF-8 without a BOM.
    # Param:   $Path - file to redact. A path that does not exist is a no-op.
    # Returns: nothing.
    # Failure: FAILS CLOSED. If the file cannot be read or rewritten, its content
    #          is replaced by a placeholder (or the file is removed) so that
    #          unredacted text can never reach the archive.

    # -LiteralPath: a file name containing [ or ] must not be treated as a wildcard.
    if (-not (Test-Path -LiteralPath $Path)) { return }

    $utf8NoBom = New-Object System.Text.UTF8Encoding($false)
    try {
        $t = Get-Content -LiteralPath $Path -Raw -ErrorAction Stop
        # Get-Content -Raw yields $null for a zero-byte file.
        if ($null -eq $t) { $t = '' }

        # The value part is [^\r\n]* rather than .* because '.' in .NET matches
        # '\r'; on CRLF files '.*' would swallow the carriage return and leave
        # the redacted lines with bare-LF endings.

        # First rule redacts ANY *password key (clientPassword, dockerRegistry
        # password, HTTP_PROXY_PASSWORD, ...) in : or = form, not just clientPassword.
        $t = $t -replace '(?i)([A-Za-z0-9_.-]*password\s*[:=]\s*)[^\r\n]*', '$1[REDACTED]'
        # user:pass@ userinfo embedded in any scheme:// URL (proxy URLs, registries).
        $t = $t -replace '([a-zA-Z][a-zA-Z0-9+.-]*://)[^:/@\s]+:[^@/\s]+@', '$1[REDACTED]@'
        # Generic secret-bearing keys.
        $t = $t -replace '(?i)((token|secret|authorization|api[_-]?key)\s*[:=]\s*)[^\r\n]*', '$1[REDACTED]'

        [System.IO.File]::WriteAllText($Path, $t, $utf8NoBom)
    } catch {
        $reason = $_.Exception.Message
        try {
            # Overwrite rather than keep: the caller archives whatever is on disk.
            [System.IO.File]::WriteAllText($Path, "[REDACTION FAILED - content withheld]`n", $utf8NoBom)
        } catch {
            Remove-Item -LiteralPath $Path -Force -ErrorAction SilentlyContinue
        }
        Write-Warning "Could not redact '$Path' ($reason); its content was withheld from the bundle."
    }
}

# Run an external CLI for the diagnose bundle and return its combined
# stdout+stderr as one string. Diagnose must work on a machine where the
# install is broken, so a tool that is not on PATH yields a marker line instead
# of a command-not-found error that would abort the caller's statement.
function Get-DiagnoseToolOutput([string]$Tool, [string[]]$ToolArgs) {
    if (-not (Get-Command $Tool -ErrorAction SilentlyContinue)) {
        return "($Tool not found on PATH)`n"
    }
    # Scoped to this function: with 'Stop', Windows PowerShell 5.1 turns native
    # stderr captured through 2>&1 into a terminating error.
    $ErrorActionPreference = 'Continue'
    try {
        return (& $Tool @ToolArgs 2>&1 | Out-String)
    } catch {
        return "($Tool failed to run: $($_.Exception.Message))`n"
    }
}

# Collect a support bundle: host/tool versions, docker + k3d state, kubectl
# status and workload logs, helm releases/values, installer logs, values.yaml
# and proxy environment. Every collected file is passed through Edit-Redaction
# and the result is zipped to <data dir>\tracebloc-diagnose-<timestamp>.zip.
# Takes no parameters and returns nothing; the outcome is reported on the console.
function Invoke-DiagnoseBundle {
    $ts   = Get-Date -Format 'yyyyMMdd-HHmmss'
    $base = if ($HOST_DATA_DIR) { $HOST_DATA_DIR } else { "$env:USERPROFILE\.tracebloc" }
    $cn   = if ($CLUSTER_NAME) { $CLUSTER_NAME } else { "tracebloc" }
    New-Item -ItemType Directory -Path $base -Force -ErrorAction SilentlyContinue | Out-Null

    # Stage under a random temp directory so only redacted files ever land in $base.
    $work = Join-Path ([System.IO.Path]::GetTempPath()) ("tracebloc-diag-" + [System.IO.Path]::GetRandomFileName())
    $d    = Join-Path $work "tracebloc-diagnose-$ts"
    New-Item -ItemType Directory -Path (Join-Path $d "logs") -Force | Out-Null

    Info "Collecting diagnostics -- this is safe; credentials are redacted before the file is written."

    $hasKubectl = [bool](Get-Command kubectl -ErrorAction SilentlyContinue)
    $hasHelm    = [bool](Get-Command helm -ErrorAction SilentlyContinue)

    # Namespace discovery (TB_NAMESPACE isn't set on a standalone diagnose run).
    $ns = $TB_NAMESPACE
    if (-not $ns -and $hasKubectl) {
        try {
            $jm = kubectl get pods -A 2>$null | Select-String '\-jobs-manager' | Select-Object -First 1
            if ($jm) { $ns = ($jm.ToString().Trim() -split '\s+')[0] }
        } catch {
            # Best effort only: an unreachable cluster falls through to "default".
            $ns = $null
        }
    }
    if (-not $ns) { $ns = "default" }

    # host / versions
    $h = @("# tracebloc diagnose ($ts)", "OS: Windows ARCH: $(Get-WindowsArch)",
        "CLIENT_ENV: $($env:CLIENT_ENV) CLUSTER_NAME: $cn NAMESPACE: $ns", "## versions",
        (Get-DiagnoseToolOutput 'k3d' @('version')),
        (Get-DiagnoseToolOutput 'kubectl' @('version', '--client')),
        (Get-DiagnoseToolOutput 'helm' @('version', '--short')),
        (Get-DiagnoseToolOutput 'docker' @('version')))
    try {
        $cs = Get-CimInstance Win32_ComputerSystem -ErrorAction Stop
        $h += "CPUs=$($cs.NumberOfLogicalProcessors) MemBytes=$($cs.TotalPhysicalMemory)"
    } catch {
        # Hardware facts are optional; keep whatever was already collected.
    }
    ($h -join "`n") | Out-File (Join-Path $d "00-host.txt") -Encoding utf8

    # docker containers for this cluster + k3d's own view of its clusters
    ((Get-DiagnoseToolOutput 'docker' @('ps', '-a', '--filter', "name=k3d-$cn-")) + "`n" +
        (Get-DiagnoseToolOutput 'k3d' @('cluster', 'list'))) | Out-File (Join-Path $d "01-docker.txt") -Encoding utf8

    if ($hasKubectl) {
        (@("## nodes", (kubectl get nodes -o wide 2>&1 | Out-String),
            "## pods", (kubectl get pods -A -o wide 2>&1 | Out-String),
            "## events", (kubectl get events -A 2>&1 | Out-String)) -join "`n") | Out-File (Join-Path $d "02-kubectl.txt") -Encoding utf8
        foreach ($w in @("mysql-client", "$ns-jobs-manager", "$ns-requests-proxy")) {
            kubectl logs -n $ns "deploy/$w" --all-containers --tail=500 2>&1 | Out-File (Join-Path $d "logs/$w.log") -Encoding utf8
        }
    }
    if ($hasHelm) {
        # NOTE(review): assumes the helm release is named after the namespace -- confirm.
        (@("## helm list", (helm list -A 2>&1 | Out-String), "## values", (helm get values $ns -n $ns 2>&1 | Out-String)) -join "`n") | Out-File (Join-Path $d "04-helm.txt") -Encoding utf8
    }

    # installer logs + the values file from the data directory
    Get-ChildItem -Path $base -Filter "install-*.log" -ErrorAction SilentlyContinue | ForEach-Object { Copy-Item $_.FullName (Join-Path $d $_.Name) -ErrorAction SilentlyContinue }
    if (Test-Path "$base\values.yaml") { Copy-Item "$base\values.yaml" (Join-Path $d "values.yaml") -ErrorAction SilentlyContinue }

    (("## proxy env`n") + ((@("HTTP_PROXY","HTTPS_PROXY","NO_PROXY") | ForEach-Object { "$_=" + [Environment]::GetEnvironmentVariable($_) }) -join "`n")) | Out-File (Join-Path $d "05-proxy.txt") -Encoding utf8

    # REDACT every collected file, THEN archive.
    Get-ChildItem -Path $d -Recurse -File | ForEach-Object { Edit-Redaction $_.FullName }
    $bundle = Join-Path $base "tracebloc-diagnose-$ts.zip"
    if (Test-Path $bundle) { Remove-Item $bundle -Force -ErrorAction SilentlyContinue }
    Compress-Archive -Path $d -DestinationPath $bundle -Force -ErrorAction SilentlyContinue
    Remove-Item $work -Recurse -Force -ErrorAction SilentlyContinue

    Write-Host ""
    # Compress-Archive errors are silenced above, so the archive's existence is
    # the success signal.
    if (Test-Path $bundle) {
        Ok "Diagnostics saved (credentials redacted):"
        Write-Host " $bundle"
        Hint "Send this file to tracebloc support -- it has logs + status with passwords removed."
    } else {
        Write-Host " Could not create the diagnostics archive." -ForegroundColor Red
    }
}

# =============================================================================
# MAIN
# =============================================================================

if (-not $env:TB_PESTER) {

if ($Help) { Print-Help }
if ($Diagnose) { Invoke-DiagnoseBundle; exit 0 }

Confirm-Config
Initialize-ToolDir
Expand Down
5 changes: 5 additions & 0 deletions scripts/install-k8s.sh
Original file line number Diff line number Diff line change
Expand Up @@ -50,12 +50,17 @@ source "${LIB_DIR}/cluster.sh"
source "${LIB_DIR}/gpu-plugins.sh"
source "${LIB_DIR}/install-client-helm.sh"
source "${LIB_DIR}/summary.sh"
source "${LIB_DIR}/diagnose.sh"

trap install_cleanup EXIT

# ── Main ─────────────────────────────────────────────────────────────────────
main() {
[[ "${1:-}" == "--help" || "${1:-}" == "-h" ]] && print_help
# Support bundle: collect redacted diagnostics and exit, before any install
# work (so it works even when the install is broken). Clear the EXIT trap so
# the post-install cleanup message doesn't fire after a diagnose run.
[[ "${1:-}" == "--diagnose" ]] && { trap - EXIT; run_diagnose; exit $?; }

validate_config
setup_log_file
Expand Down
1 change: 1 addition & 0 deletions scripts/install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ FILES=(
"scripts/lib/gpu-plugins.sh"
"scripts/lib/install-client-helm.sh"
"scripts/lib/summary.sh"
"scripts/lib/diagnose.sh"
)

download_with_retry() {
Expand Down
6 changes: 4 additions & 2 deletions scripts/lib/cluster.sh
Original file line number Diff line number Diff line change
Expand Up @@ -291,8 +291,10 @@ _create_new_cluster() {

local create_out create_rc
create_out="$(mktemp)"
k3d "${K3D_ARGS[@]}" >"$create_out" 2>&1
create_rc=$?
# Capture the exit code WITHOUT tripping `set -e`: a bare failing command here
# would abort the script immediately, skipping the 'already exists' reuse path,
# the error dump, and the temp-dir cleanup below.
k3d "${K3D_ARGS[@]}" >"$create_out" 2>&1 && create_rc=0 || create_rc=$?
[[ -n "$proxy_cfg" ]] && rm -rf "${proxy_cfg%/*}"
if [[ $create_rc -ne 0 ]]; then
if grep -qi "already exists\|a cluster with that name already exists" "$create_out" 2>/dev/null; then
Expand Down
8 changes: 7 additions & 1 deletion scripts/lib/common.sh
Original file line number Diff line number Diff line change
Expand Up @@ -351,7 +351,13 @@ tracebloc — client setup

Usage:
curl -fsSL https://raw.githubusercontent.com/tracebloc/client/main/scripts/install.sh | bash
./install-k8s.sh [--help]
./install-k8s.sh [--help] [--diagnose]

Commands:
--diagnose Collect a redacted support bundle (logs + cluster/host status)
into ~/.tracebloc/tracebloc-diagnose-<timestamp>.tgz and exit.
Run this if something went wrong, then send the file to support
(passwords and proxy credentials are removed before it is written).

Advanced configuration (environment variables):
CLUSTER_NAME Cluster name (default: tracebloc)
Expand Down
Loading
Loading