From 1f715c84801c8ac1ca8413e95f8eef8e6bace393 Mon Sep 17 00:00:00 2001
From: lukasWuttke <54042461+LukasWodka@users.noreply.github.com>
Date: Wed, 17 Jun 2026 18:34:27 +0200
Subject: [PATCH 01/15] ci(security): add public-repo PII gate caller (#81)

* ci(security): add public-repo PII gate caller

Blocks PRs that leak customer/partner names or secrets in title/body/commits.
Calls the reusable gate in tracebloc/.github. Inactive until the org
PII_DENYLIST secret is set (warns, doesn't block, until then).

* chore(schema): sync ingest.v1.json from data-ingestors master

The vendored copy at internal/schema/ingest.v1.json had drifted from
upstream, failing the `scripts/sync-schema.sh --check` CI gate on every
PR. Upstream replaced `instance_segmentation` with `token_classification`
across the category enums, updated the texts/resolution descriptions
([width, height] order + PIL note), and added the token_classification
`texts` and masked_language_modeling no-`label` conditional rules.

Regenerated via `scripts/sync-schema.sh`; `--check` is now clean.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>

---------

Co-authored-by: shujaat hasan <shujaathasan@shujaats-MacBook-Pro.local>
Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
---
 .github/workflows/public-pii-gate-caller.yml | 14 ++++++++
 internal/schema/ingest.v1.json               | 35 ++++++++++++++++----
 2 files changed, 42 insertions(+), 7 deletions(-)
 create mode 100644 .github/workflows/public-pii-gate-caller.yml

diff --git a/.github/workflows/public-pii-gate-caller.yml b/.github/workflows/public-pii-gate-caller.yml
new file mode 100644
index 0000000..4af6016
--- /dev/null
+++ b/.github/workflows/public-pii-gate-caller.yml
@@ -0,0 +1,14 @@
+name: Public PII gate
+
+# Per-repo caller for the public-repo PII gate. Blocks PRs whose title/body/
+# commits contain a denylisted customer/partner name or known secret.
+# Logic lives in tracebloc/.github/.github/workflows/public-pii-gate.yml.
+
+on:
+  pull_request:
+    types: [opened, edited, reopened, synchronize, labeled, unlabeled]
+
+jobs:
+  pii-gate:
+    uses: tracebloc/.github/.github/workflows/public-pii-gate.yml@main
+    secrets: inherit
diff --git a/internal/schema/ingest.v1.json b/internal/schema/ingest.v1.json
index d8eb83c..7f4f4a0 100644
--- a/internal/schema/ingest.v1.json
+++ b/internal/schema/ingest.v1.json
@@ -30,8 +30,8 @@
         "object_detection",
         "keypoint_detection",
         "semantic_segmentation",
-        "instance_segmentation",
         "text_classification",
+        "token_classification",
         "tabular_classification",
         "tabular_regression",
         "time_series_forecasting",
@@ -86,7 +86,7 @@
     "texts": {
       "type": "string",
       "minLength": 1,
-      "description": "Directory holding text files referenced by the labels CSV. Required for text_classification."
+      "description": "Directory holding text files referenced by the labels CSV. Required for text_classification and token_classification."
     },
 
     "sequences": {
@@ -143,7 +143,7 @@
       "items": { "type": "integer", "minimum": 1 },
       "minItems": 2,
       "maxItems": 2,
-      "description": "[height, width] to resize images to. Required for keypoint_detection (no default — depends on the customer's pose model). Other image categories use category defaults if unset."
+      "description": "[width, height] to resize images to. Required for keypoint_detection (no default — depends on the customer's pose model). Other image categories use category defaults if unset. The order matches PIL.Image.size and what ImageResolutionValidator expects."
     },
 
     "number_of_keypoints": {
@@ -198,7 +198,7 @@
               "items": { "type": "integer", "minimum": 1 },
               "minItems": 2,
               "maxItems": 2,
-              "description": "[height, width]. Image categories only. Default [512, 512]."
+              "description": "[width, height]. Image categories only. Default [512, 512]. The order matches PIL.Image.size and what ImageResolutionValidator expects."
             },
             "extension": {
               "type": "string",
@@ -288,8 +288,7 @@
               "image_classification",
               "object_detection",
               "keypoint_detection",
-              "semantic_segmentation",
-              "instance_segmentation"
+              "semantic_segmentation"
             ]
           }
         },
@@ -321,6 +320,14 @@
       },
       "then": { "required": ["texts"] }
     },
+    {
+      "description": "token_classification requires `texts`.",
+      "if": {
+        "properties": { "category": { "const": "token_classification" } },
+        "required": ["category"]
+      },
+      "then": { "required": ["texts"] }
+    },
     {
       "description": "masked_language_modeling requires `sequences`.",
       "if": {
@@ -388,8 +395,8 @@
               "object_detection",
               "keypoint_detection",
               "semantic_segmentation",
-              "instance_segmentation",
               "text_classification",
+              "token_classification",
               "tabular_classification"
             ]
           }
@@ -397,6 +404,20 @@
         "required": ["category"]
       },
       "then": { "required": ["label"] }
+    },
+    {
+      "description": "Self-supervised categories MUST NOT set `label`. The shipped CSV has no label column, and the framework registers no edge-label metadata for them. Setting `label` anyway used to ingest rows successfully, then crash at backend registration with a misleading HTTP 400 'No data found' (issue #213). Reject at submission instead.",
+      "if": {
+        "properties": {
+          "category": {
+            "enum": [
+              "masked_language_modeling"
+            ]
+          }
+        },
+        "required": ["category"]
+      },
+      "then": { "not": { "required": ["label"] } }
     }
   ]
 }

From 99c6c28e08c5ffa2fcc75d62cda0ef158e5ceea6 Mon Sep 17 00:00:00 2001
From: lukasWuttke <54042461+LukasWodka@users.noreply.github.com>
Date: Wed, 17 Jun 2026 18:49:55 +0200
Subject: [PATCH 02/15] ci: add concurrency cancellation + job timeouts to cli
 CI (#82)

Adds a per-ref concurrency group (cancels superseded PR runs only;
push/tag/schedule never cancelled) and timeout-minutes to every job, so
stale PR pushes stop wasting runner time and hung steps (kind boot, cosign)
can't run to the 6h default. No change to job behavior.

Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
---
 .github/workflows/build.yml   | 8 ++++++++
 .github/workflows/e2e.yml     | 5 +++++
 .github/workflows/release.yml | 7 +++++++
 3 files changed, 20 insertions(+)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index d1c8dba..d927706 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -14,8 +14,13 @@ on:
 permissions:
   contents: read
 
+concurrency:
+  group: build-${{ github.ref }}
+  cancel-in-progress: ${{ github.event_name == 'pull_request' }}
+
 jobs:
   schema-drift:
+    timeout-minutes: 10
     name: Schema drift check
     # Verifies the embedded internal/schema/ingest.v1.json matches
     # tracebloc/data-ingestors' master. A green PR that silently
@@ -30,6 +35,7 @@ jobs:
         run: ./scripts/sync-schema.sh --check
 
   test:
+    timeout-minutes: 15
     name: Test
     runs-on: ubuntu-latest
     steps:
@@ -52,6 +58,7 @@ jobs:
         run: go test -race -cover ./...
 
   lint:
+    timeout-minutes: 10
     name: Lint
     runs-on: ubuntu-latest
     steps:
@@ -103,6 +110,7 @@ jobs:
           misspell -error .
 
   build:
+    timeout-minutes: 20
     name: Build (${{ matrix.os }}/${{ matrix.arch }})
     runs-on: ubuntu-latest
     strategy:
diff --git a/.github/workflows/e2e.yml b/.github/workflows/e2e.yml
index 660921c..10a0beb 100644
--- a/.github/workflows/e2e.yml
+++ b/.github/workflows/e2e.yml
@@ -21,8 +21,13 @@ on:
 permissions:
   contents: read
 
+concurrency:
+  group: e2e-${{ github.ref }}
+  cancel-in-progress: ${{ github.event_name == 'pull_request' }}
+
 jobs:
   integration:
+    timeout-minutes: 30
     name: Integration (kind)
     runs-on: ubuntu-latest
     # Skip on PRs that aren't explicitly opted in via the `e2e` label;
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 1468f1c..0a8f889 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -44,8 +44,13 @@ permissions:
   contents: write  # create / update the GitHub Release
   id-token: write  # cosign keyless OIDC
 
+concurrency:
+  group: release-${{ github.ref }}
+  cancel-in-progress: ${{ github.event_name == 'pull_request' }}
+
 jobs:
   release:
+    timeout-minutes: 20
     name: Build + sign + publish
     runs-on: ubuntu-latest
     strategy:
@@ -158,6 +163,7 @@ jobs:
           if-no-files-found: error
 
   publish:
+    timeout-minutes: 20
     name: Aggregate + create GitHub Release
     runs-on: ubuntu-latest
     needs: release
@@ -220,6 +226,7 @@ jobs:
   # add a HOMEBREW_TAP_TOKEN repo secret with write access to
   # tracebloc/homebrew-tap) when that repo exists.
   bump-homebrew-tap:
+    timeout-minutes: 20
     name: Bump Homebrew tap formula
     runs-on: ubuntu-latest
     needs: publish

From d5a7c3f16deeff5629a04c88f51c5b81b4b93e7e Mon Sep 17 00:00:00 2001
From: lukasWuttke <54042461+LukasWodka@users.noreply.github.com>
Date: Thu, 18 Jun 2026 10:29:42 +0200
Subject: [PATCH 03/15] Merge pull request #79 from
 tracebloc/chore/sync-ingest-schema

chore(schema): re-sync vendored ingest.v1.json from data-ingestors master

From c6e80df2890ffa16e345dcddbbf9d1adced401c0 Mon Sep 17 00:00:00 2001
From: lukasWuttke <54042461+LukasWodka@users.noreply.github.com>
Date: Thu, 18 Jun 2026 10:38:46 +0200
Subject: [PATCH 04/15] fix(dataset rm): delete staging files from a uid-65532
 pod, not jobs-manager (#259) (#78)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`tracebloc dataset rm` dropped the table but failed to delete the dataset's
staging files on the shared PVC:

  rm: cannot remove '/data/shared/.tracebloc-staging/<t>/labels.csv': Permission denied

Root cause: the staging files are written by the CLI's ephemeral stage pod as
uid 65532 (+ fsGroup 65532), but the teardown exec'd `rm` inside the long-lived
jobs-manager pod, which runs as a different non-root uid with no shared fsGroup.
A non-65532 uid cannot delete 65532-owned files in a non-group-writable dir, so
the rm hit EACCES and left orphans. The "re-run to clean up" advice was a dead
end — the same permission error every time.

Fix: run the teardown `rm` from a short-lived pod that mirrors the stage pod's
identity (uid 65532 + fsGroup 65532, shared PVC mounted), reusing the existing
BuildStagePodSpec / CreateStagePod / WaitForStagePodReady / DeleteStagePod
machinery. That pod OWNS the staging files it deletes, so it works by ownership
on hostPath (where fsGroup is a no-op, kubernetes/kubernetes#138411) and CSI
alike. Fully fixes tabular datasets (no sidecar files) on every volume type.

Also:
- Teardown now takes an injectable Executor (matching push.Stage), enabling a
  regression test that pins "rm runs in a uid-65532 stage pod, not jobs-manager".
- dataset_rm: drop the misleading "re-run completes the cleanup" claim; the table
  DROP is idempotent, and if file removal keeps failing, point to node-side cleanup.

Refs #259. The image/sidecar case (ingestor's /data/shared/<table> written as
uid 65534) on hostPath still needs the documented complement (ingestor fsGroup +
group-writable DEST_PATH in client-runtime/data-ingestors).

Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
Co-authored-by: Asad Iqbal <asad.dsoft@gmail.com>
---
 internal/cli/dataset_rm.go     | 28 ++++++++---
 internal/push/teardown.go      | 54 ++++++++++++++-------
 internal/push/teardown_test.go | 89 +++++++++++++++++++++++++++++++++-
 3 files changed, 145 insertions(+), 26 deletions(-)

diff --git a/internal/cli/dataset_rm.go b/internal/cli/dataset_rm.go
index a7c398f..ecc83f6 100644
--- a/internal/cli/dataset_rm.go
+++ b/internal/cli/dataset_rm.go
@@ -178,18 +178,30 @@ undone — re-pushing the data is the only way back.`)
 		}
 	}
 
-	// 7. Execute the in-cluster teardown.
+	// 7. Execute the in-cluster teardown. File removal runs in a
+	//    short-lived pod that shares the stage pod's identity (uid
+	//    65532 + fsGroup 65532), so it owns and can delete the staging
+	//    files on any volume type — including hostPath, where fsGroup is
+	//    a no-op (tracebloc/client#259).
 	p.Infof("Removing in-cluster artifacts…")
-	res, err := push.Teardown(ctx, cs, resolved.RestConfig, resolved.Namespace, plan)
+	res, err := push.Teardown(ctx, cs, &push.SPDYExecutor{Config: resolved.RestConfig, Client: cs}, resolved.Namespace, plan, push.PodSpecOptions{
+		Namespace:          resolved.Namespace,
+		PVCClaimName:       pvc.ClaimName,
+		PVCMountPath:       pvc.MountPath,
+		Table:              a.Table,
+		ServiceAccountName: release.IngestorSAName,
+		// Image left empty → push.DefaultStagePodImage (alpine; has rm).
+	})
 	if err != nil {
-		// Teardown is two sequential destructive ops; if the table drop
-		// succeeded but file removal didn't, say so — both ops are
-		// idempotent, so re-running completes the cleanup. (Bugbot #49)
+		// Two sequential destructive ops. If the table dropped but file
+		// removal didn't, the drop is idempotent (DROP TABLE IF EXISTS),
+		// so re-running is safe; if it keeps failing, remove the leftover
+		// staging dirs on the node directly.
 		if res.DroppedTable {
 			return &exitError{code: 7, err: fmt.Errorf(
-				"teardown incomplete — the table %s.%s was dropped, but removing its files failed; "+
-					"re-run `tracebloc dataset rm %s` to remove the leftover files: %w",
-				plan.Database, plan.Table, a.Table, err)}
+				"teardown incomplete — the table %s.%s was dropped, but removing its files failed: %w; "+
+					"re-run `tracebloc dataset rm %s`, or delete the leftover staging dirs on the node",
+				plan.Database, plan.Table, err, a.Table)}
 		}
 		return &exitError{code: 7, err: fmt.Errorf("teardown failed: %w", err)}
 	}
diff --git a/internal/push/teardown.go b/internal/push/teardown.go
index 5d30ccf..1c270fe 100644
--- a/internal/push/teardown.go
+++ b/internal/push/teardown.go
@@ -5,11 +5,11 @@ import (
 	"context"
 	"fmt"
 	"strings"
+	"time"
 
 	corev1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/client-go/kubernetes"
-	"k8s.io/client-go/rest"
 )
 
 // IngestionDatabase is the MySQL schema jobs-manager ingests tables
@@ -49,24 +49,30 @@ type TeardownResult struct {
 	RemovedPaths []string
 }
 
-// Teardown performs the in-cluster teardown described by plan, mirroring
-// the manual kubectl-exec cleanup:
+// Teardown performs the in-cluster teardown described by plan:
 //
 //   - DROP the MySQL table by exec-ing `mysql` inside the mysql pod,
 //     referencing the pod's own $MYSQL_ROOT_PASSWORD — so no database
 //     credential ever transits the CLI.
-//   - rm -rf the PVC dirs by exec-ing inside the jobs-manager pod,
-//     which mounts the shared PVC at SharedRoot.
+//   - rm -rf the PVC dirs from a short-lived pod that mirrors the CLI's
+//     stage pod (uid 65532 + fsGroup 65532, shared PVC mounted) — built
+//     from podOpts via BuildStagePodSpec.
 //
-// DESIGN NOTE: this exec-into-existing-pods approach is the
-// "CLI-direct teardown" (the alternative considered was a server-side
-// jobs-manager delete endpoint). It assumes (a) a pod whose name
-// contains "mysql" exposes $MYSQL_ROOT_PASSWORD, and (b) the
-// jobs-manager pod mounts the shared PVC at SharedRoot — both true for
-// the current parent chart, but worth confirming before this ships.
-func Teardown(ctx context.Context, cs kubernetes.Interface, cfg *rest.Config, namespace string, plan TeardownPlan) (TeardownResult, error) {
+// Why an ephemeral stage-identity pod and NOT the long-lived
+// jobs-manager pod (tracebloc/client#259): the staging files under
+// SharedRoot/.tracebloc-staging/<table> are written by the stage pod as
+// uid 65532 (and the ingestor's SharedRoot/<table> files as uid 65534).
+// The jobs-manager pod runs as a different non-root uid with no shared
+// fsGroup, so its `rm` hit EACCES on 65532-owned files in a
+// non-group-writable directory and left orphans. A teardown pod that
+// runs as the same uid that wrote the staging files OWNS them, so it
+// deletes them by ownership — which works on hostPath (where fsGroup is
+// a no-op, kubernetes/kubernetes#138411) and CSI alike.
+//
+// DESIGN NOTE: still assumes a pod whose name contains "mysql" exposes
+// $MYSQL_ROOT_PASSWORD (true for the current parent chart).
+func Teardown(ctx context.Context, cs kubernetes.Interface, exec Executor, namespace string, plan TeardownPlan, podOpts PodSpecOptions) (TeardownResult, error) {
 	var res TeardownResult
-	exec := &SPDYExecutor{Config: cfg, Client: cs}
 
 	// 1. DROP the table — mysql pod, localhost, its own root password.
 	mysqlPod, mysqlContainer, err := findRunningPod(ctx, cs, namespace, "mysql")
@@ -82,14 +88,28 @@ func Teardown(ctx context.Context, cs kubernetes.Interface, cfg *rest.Config, na
 	}
 	res.DroppedTable = true
 
-	// 2. rm the PVC dirs — jobs-manager pod mounts the shared PVC.
-	jmPod, jmContainer, err := findRunningPod(ctx, cs, namespace, "jobs-manager")
+	// 2. rm the PVC dirs from an ephemeral stage-identity pod (see the
+	//    doc note above + #259). The pod owns the staging files it
+	//    deletes, so this works on hostPath and CSI.
+	podOpts.Namespace = namespace
+	podName, err := CreateStagePod(ctx, cs, podOpts)
 	if err != nil {
-		return res, fmt.Errorf("locating jobs-manager pod: %w", err)
+		return res, fmt.Errorf("creating teardown pod: %w", err)
+	}
+	// Always clean up the teardown pod — even if the wait/exec fails or
+	// the parent ctx is cancelled. Fresh ctx so the delete still reaches
+	// the API. Mirrors push.Stage's deferred cleanup.
+	defer func() {
+		delCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
+		defer cancel()
+		_ = DeleteStagePod(delCtx, cs, namespace, podName)
+	}()
+	if _, err := WaitForStagePodReady(ctx, cs, namespace, podName); err != nil {
+		return res, fmt.Errorf("waiting for teardown pod: %w", err)
 	}
 	stderr.Reset()
 	rmCmd := append([]string{"rm", "-rf"}, plan.PVCPaths...)
-	if err := exec.Exec(ctx, namespace, jmPod, jmContainer, rmCmd, nil, nil, &stderr); err != nil {
+	if err := exec.Exec(ctx, namespace, podName, "stage", rmCmd, nil, nil, &stderr); err != nil {
 		return res, fmt.Errorf("removing PVC paths: %w%s", err, stderrSuffix(&stderr))
 	}
 	res.RemovedPaths = plan.PVCPaths
diff --git a/internal/push/teardown_test.go b/internal/push/teardown_test.go
index a3e9444..d7275d7 100644
--- a/internal/push/teardown_test.go
+++ b/internal/push/teardown_test.go
@@ -1,6 +1,15 @@
 package push
 
-import "testing"
+import (
+	"context"
+	"strings"
+	"testing"
+
+	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/client-go/kubernetes/fake"
+	k8stesting "k8s.io/client-go/testing"
+)
 
 // TestPlanTeardown pins the artifact set `dataset rm` targets: the
 // MySQL table in IngestionDatabase + both PVC dirs (final dest +
@@ -28,3 +37,81 @@ func TestPlanTeardown(t *testing.T) {
 		}
 	}
 }
+
+// TestTeardown_RemovesViaStageIdentityPod is the regression test for
+// tracebloc/client#259: `dataset rm` must NOT run the file `rm` inside
+// the long-lived jobs-manager pod (a non-root uid that cannot delete
+// the uid-65532-owned staging files). It must run it in a short-lived
+// pod that mirrors the stage pod's identity (uid 65532 + fsGroup 65532),
+// which owns the staging files and so can delete them on any volume type
+// (hostPath included, where fsGroup is a no-op).
+func TestTeardown_RemovesViaStageIdentityPod(t *testing.T) {
+	// A running "mysql" pod must exist so step 1 (DROP TABLE) can locate
+	// it. The teardown pod is created by Teardown itself and marked Ready
+	// by the reactor (shared with stage_test.go).
+	cs := fake.NewClientset(&corev1.Pod{
+		ObjectMeta: metav1.ObjectMeta{Name: "mysql-0", Namespace: "tracebloc"},
+		Spec:       corev1.PodSpec{Containers: []corev1.Container{{Name: "mysql"}}},
+		Status:     corev1.PodStatus{Phase: corev1.PodRunning},
+	})
+	readyOnNextGet(cs)
+	fe := &fakeExecutor{}
+
+	plan := PlanTeardown("reg_train")
+	res, err := Teardown(context.Background(), cs, fe, "tracebloc", plan, PodSpecOptions{
+		Namespace:    "tracebloc",
+		PVCClaimName: "client-pvc",
+		PVCMountPath: "/data/shared",
+		Table:        "reg_train",
+	})
+	if err != nil {
+		t.Fatalf("Teardown: %v", err)
+	}
+	if !res.DroppedTable {
+		t.Error("DroppedTable = false, want true")
+	}
+
+	// The rm is the LAST Exec call. It must target the ephemeral stage
+	// pod, NOT a jobs-manager pod — that's the #259 fix.
+	if !strings.HasPrefix(fe.gotPod, "tracebloc-stage-") {
+		t.Errorf("rm ran in pod %q, want the ephemeral stage pod (tracebloc-stage-*); "+
+			"running it in the jobs-manager pod is the #259 bug", fe.gotPod)
+	}
+	if strings.Contains(fe.gotPod, "jobs-manager") {
+		t.Errorf("rm ran in the jobs-manager pod (%q) — the #259 regression", fe.gotPod)
+	}
+	if fe.gotContainer != "stage" {
+		t.Errorf("rm container = %q, want stage", fe.gotContainer)
+	}
+	wantCmd := "rm -rf " + strings.Join(plan.PVCPaths, " ")
+	if got := strings.Join(fe.gotCmd, " "); got != wantCmd {
+		t.Errorf("rm cmd = %q, want %q", got, wantCmd)
+	}
+
+	// The teardown pod must run as the stage uid (65532) + fsGroup so it
+	// OWNS the staging files it deletes. Inspect the created Pod via the
+	// fake clientset action log (the pod is deleted before the test ends).
+	var sc *corev1.PodSecurityContext
+	for _, action := range cs.Actions() {
+		if action.GetVerb() == "create" && action.GetResource().Resource == "pods" {
+			sc = action.(k8stesting.CreateAction).GetObject().(*corev1.Pod).Spec.SecurityContext
+			break
+		}
+	}
+	if sc == nil {
+		t.Fatal("no Pod create observed — teardown did not spawn an ephemeral pod")
+	}
+	if sc.RunAsUser == nil || *sc.RunAsUser != 65532 {
+		t.Errorf("teardown pod RunAsUser = %v, want 65532", sc.RunAsUser)
+	}
+	if sc.FSGroup == nil || *sc.FSGroup != 65532 {
+		t.Errorf("teardown pod FSGroup = %v, want 65532", sc.FSGroup)
+	}
+
+	// No leaked teardown pods after a successful run.
+	pods, _ := cs.CoreV1().Pods("tracebloc").List(context.Background(),
+		metav1.ListOptions{LabelSelector: StagePodManagedByLabel + "=" + StagePodManagedByValue})
+	if len(pods.Items) != 0 {
+		t.Errorf("Teardown leaked %d stage pod(s)", len(pods.Items))
+	}
+}

From 6b259cd1d5738e6c85337ed1131df254ae075f99 Mon Sep 17 00:00:00 2001
From: "Asad Iqbal (Saadi)" <asad.dsoft@gmail.com>
Date: Thu, 18 Jun 2026 19:28:12 +0500
Subject: [PATCH 05/15] =?UTF-8?q?feat(#88):=20tracebloc=20cluster=20doctor?=
 =?UTF-8?q?=20=E2=80=94=20live-cluster=20health=20checks=20(WS3)=20(#89)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* feat(#88): tracebloc cluster doctor — live-cluster health checks (WS3)

Adds `tracebloc cluster doctor`, a read-only health sweep of a running
tracebloc client cluster that prints ✔/⚠/✖ per check with a remedy — so a
customer can diagnose "why isn't my experiment running?" without tracebloc
shelling into their cluster (epic client-runtime#116, WS3).

Sibling of `cluster info` (which the code's own comment anticipated); reuses
its kubeconfig/context/namespace flags + cluster.Load / NewClientset /
DiscoverParentRelease, the ui.Printer status vocabulary, and exitError.

Lean MVP — 6 checks:
- cluster reachable      (parent client release discovered)
- pod health            (crash-loops / long-Pending — local complement to #117)
- dataset volume        (shared PVC Bound, via cluster.DiscoverSharedPVC)
- proxy configuration   (in-cluster requests/egress proxy wiring)
- backend egress        (host-side, proxy-aware probe; in-cluster probe = follow-up)
- Service Bus egress    (requests-proxy readiness — the experiments-queue broker)

internal/doctor is a standalone package with injectable network probes,
82% covered via client-go's fake clientset. Every check is independent and
best-effort (one failure never hides the others); the worst status sets the
exit code (0 ok/warn, 2 failures, 3 kubeconfig).

Out of scope (already shipped / follow-up): support-bundle ships as the
installer's `--diagnose`; node-resources-vs-job-request and image-pullability
are the broader cut.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

* fix(#88): don't flag Succeeded/recovered pods as crash-looping (Bugbot)

podCrashLooping flagged any pod with RestartCount>=3 — including Succeeded
job pods that retried before completing, and Running pods that recovered
after past restarts — producing a false ✖ when nothing is actually unhealthy.

Guard terminal phases (Succeeded/Failed) and require the container to not be
currently running, mirroring the controller's recovered-container fix
(client-runtime#117). Adds regression tests for both false-positive cases.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

* fix(#88): detect init-container crash-loops + nil-release prefixed deploys (Bugbot)

Two Medium Bugbot findings on the previous commit:

- podCrashLooping ignored InitContainerStatuses, so an init container stuck in
  CrashLoopBackOff read as a Pending warning instead of a failure even though
  the pod cannot start. It now checks init + app containers, and detects only
  active CrashLoopBackOff — dropping the RestartCount heuristic entirely, since
  that was the source of the earlier Succeeded/recovered-pod false positives.

- requestsProxyNames/jobsManagerNames only probed unprefixed names when the
  release was nil (e.g. DiscoverParentRelease errored on multiple releases),
  falsely reporting missing wiring even though <release>-requests-proxy exists.
  Added findDeployment: exact-name Get, then a namespace List + name-suffix
  fallback that resolves the prefixed name without knowing the release.

Adds regression tests: init-crash-loop, nil-release-finds-prefixed.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

* docs/polish(#88): address Arturo's post-approval review nits

- Tunables comment: they're conservative package consts, not vars; point at
  Options (like HTTPProbe) for any future runtime tuning.
- checkProxy WARN: note that a REQUESTS_PROXY_URL set via a configMap/secret
  ref reads as empty here (jobsManagerEnv reads only literal env).
- httpProbe: a successful connection means reachable — discard the body-close
  error rather than reporting it as "unreachable".

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

* fix(#88): suffix fallback must not pick across multiple releases (Bugbot)

findDeployment's suffix fallback (added for the nil-release case) picked the
first suffix-matching deployment, so in a namespace running multiple parent
releases, jobsManagerEnv and checkRequestsProxy could resolve to different
releases in a single run — presenting mixed data as fact.

Resolve the fallback only when exactly one deployment carries the suffix;
with more than one (the multi-release case DiscoverParentRelease already
refuses to disambiguate) return nil and let the check report can't-determine.
The single-release nil-discovery case still resolves.

Adds TestCheckRequestsProxy_NilReleaseAmbiguous.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

* fix(#88): tie deployment lookup to the discovered release (Bugbot)

When a release was discovered, findDeployment's fallbacks could still match a
DIFFERENT release's component (or a stray bare one), so the Service Bus check
went green on the wrong requests-proxy while the discovered release's was
missing.

findDeployment now takes the release directly. When it's known, it accepts only
"<release>-<suffix>" or a bare "<suffix>" whose app.kubernetes.io/instance label
ties it to that release — never another release's, never an unattributable bare
one. The release-unknown path keeps the exactly-one-suffix-match rule (returns
nil on >1, so checks report can't-determine rather than guess).

Folds the jobsManagerNames/requestsProxyNames candidate builders into
findDeployment. Adds tests: other-release-ignored, bare-name-tied-by-label.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 internal/cli/cluster.go        |  13 +-
 internal/cli/doctor.go         | 133 +++++++++++
 internal/doctor/doctor.go      | 416 +++++++++++++++++++++++++++++++++
 internal/doctor/doctor_test.go | 341 +++++++++++++++++++++++++++
 4 files changed, 898 insertions(+), 5 deletions(-)
 create mode 100644 internal/cli/doctor.go
 create mode 100644 internal/doctor/doctor.go
 create mode 100644 internal/doctor/doctor_test.go

diff --git a/internal/cli/cluster.go b/internal/cli/cluster.go
index cea1b27..db2cd7a 100644
--- a/internal/cli/cluster.go
+++ b/internal/cli/cluster.go
@@ -13,11 +13,13 @@ import (
 	"github.com/tracebloc/cli/internal/ui"
 )
 
-// newClusterCmd wires the `tracebloc cluster` subtree. Today it has
-// a single verb — `info` — which is the customer's "is the CLI
-// pointing at the right cluster?" pre-flight before running
-// `dataset push`. Future verbs (e.g. `cluster doctor` for
-// diagnostics, `cluster contexts` for switching) hang off this
+// newClusterCmd wires the `tracebloc cluster` subtree:
+//   - `info`   — the customer's "is the CLI pointing at the right
+//     cluster?" pre-flight before running `dataset push`.
+//   - `doctor` — a read-only health sweep of the running release with
+//     ✔/⚠/✖ checks + remedies (epic client-runtime#116, WS3).
+//
+// Future verbs (e.g. `cluster contexts` for switching) hang off this
 // parent in later phases.
 func newClusterCmd() *cobra.Command {
 	cmd := &cobra.Command{
@@ -33,6 +35,7 @@ the wrong cluster).`,
 	}
 
 	cmd.AddCommand(newClusterInfoCmd())
+	cmd.AddCommand(newClusterDoctorCmd())
 	return cmd
 }
 
diff --git a/internal/cli/doctor.go b/internal/cli/doctor.go
new file mode 100644
index 0000000..71a484f
--- /dev/null
+++ b/internal/cli/doctor.go
@@ -0,0 +1,133 @@
+package cli
+
+import (
+	"context"
+	"fmt"
+
+	"github.com/spf13/cobra"
+
+	"github.com/tracebloc/cli/internal/cluster"
+	"github.com/tracebloc/cli/internal/doctor"
+	"github.com/tracebloc/cli/internal/ui"
+)
+
+// newClusterDoctorCmd implements `tracebloc cluster doctor` — the sibling of
+// `cluster info` that cluster.go's doc comment anticipated. Where `info`
+// answers "is the CLI pointing at the right cluster?", `doctor` answers "is
+// this running cluster healthy enough to run an experiment, and if not, what
+// do I fix?" — a read-only, post-install health sweep with remedies
+// (epic client-runtime#116, WS3).
+//
+// The three kubeconfig flags match `cluster info` exactly so muscle memory
+// carries over; all are zero-value-safe.
+func newClusterDoctorCmd() *cobra.Command {
+	var (
+		kubeconfigPath  string
+		contextOverride string
+		nsOverride      string
+	)
+
+	cmd := &cobra.Command{
+		Use:   "doctor",
+		Short: "Diagnose a running tracebloc client cluster (✔/⚠/✖ health checks + remedies)",
+		Long: `Runs a read-only health sweep over the tracebloc client release in the
+configured cluster + namespace and prints a ✔/⚠/✖ line per check with a
+remedy for anything that isn't green:
+
+  • Cluster reachable — the API answers and the client chart is installed
+  • Pod health — nothing crash-looping or stuck Pending
+  • Dataset volume — the shared PVC exists and is Bound
+  • Proxy configuration — the in-cluster requests/egress proxy wiring
+  • Backend egress — the tracebloc backend is reachable (from this machine)
+  • Service Bus egress — the requests-proxy that brokers experiment egress is Ready
+
+For a full redacted support bundle to send to tracebloc, use the installer's
+` + "`./install-k8s.sh --diagnose`" + ` instead.
+
+Exit codes:
+  0   all checks passed (or warnings only)
+  2   one or more checks failed
+  3   kubeconfig could not be loaded`,
+		Args: cobra.NoArgs,
+		RunE: func(cmd *cobra.Command, _ []string) error {
+			return runClusterDoctor(
+				cmd.Context(),
+				printerFor(cmd),
+				kubeconfigPath, contextOverride, nsOverride,
+			)
+		},
+	}
+
+	cmd.Flags().StringVar(&kubeconfigPath, "kubeconfig", "",
+		"path to the kubeconfig file (default: $KUBECONFIG, then ~/.kube/config)")
+	cmd.Flags().StringVar(&contextOverride, "context", "",
+		"name of the kubeconfig context to use (default: kubeconfig's current-context)")
+	cmd.Flags().StringVarP(&nsOverride, "namespace", "n", "",
+		"namespace where the parent tracebloc/client release is installed (default: the context's namespace, or 'default')")
+
+	return cmd
+}
+
+func runClusterDoctor(
+	ctx context.Context,
+	p *ui.Printer,
+	kubeconfigPath, contextOverride, nsOverride string,
+) error {
+	p.Banner("tracebloc", "cluster doctor")
+
+	resolved, err := cluster.Load(cluster.KubeconfigOptions{
+		Path:      kubeconfigPath,
+		Context:   contextOverride,
+		Namespace: nsOverride,
+	})
+	if err != nil {
+		// 3 = kubeconfig file/parse problem (same class as cluster info).
+		return &exitError{code: 3, err: fmt.Errorf("loading kubeconfig: %w", err)}
+	}
+
+	cs, err := cluster.NewClientset(resolved)
+	if err != nil {
+		return &exitError{code: 3, err: err}
+	}
+
+	p.Section("Kubeconfig")
+	p.Field("context", resolved.Context)
+	p.Field("server", resolved.ServerURL)
+	p.Field("namespace", resolved.Namespace)
+
+	results := doctor.Run(ctx, cs, doctor.Options{Namespace: resolved.Namespace})
+
+	p.Section("Checks")
+	for _, r := range results {
+		switch r.Status {
+		case doctor.StatusOK:
+			p.Successf("%s — %s", r.Name, r.Detail)
+		case doctor.StatusWarn:
+			p.Warnf("%s — %s", r.Name, r.Detail)
+			if r.Remedy != "" {
+				p.Hintf("     %s", r.Remedy)
+			}
+		case doctor.StatusFail:
+			p.Errorf("%s — %s", r.Name, r.Detail)
+			if r.Remedy != "" {
+				p.Hintf("     %s", r.Remedy)
+			}
+		}
+	}
+
+	p.Newline()
+	switch doctor.Worst(results) {
+	case doctor.StatusFail:
+		p.Errorf("Problems found — fix the ✖ items above.")
+		p.Hintf("For deeper triage, send tracebloc a support bundle: ./install-k8s.sh --diagnose")
+		// Silent (err == nil): the per-check lines above already explained it,
+		// so main() shouldn't print a redundant "Error:" line.
+		return &exitError{code: 2, err: nil}
+	case doctor.StatusWarn:
+		p.Warnf("Completed with warnings — review the ⚠ items above.")
+		return nil
+	default:
+		p.Successf("All checks passed — the cluster looks healthy.")
+		return nil
+	}
+}
diff --git a/internal/doctor/doctor.go b/internal/doctor/doctor.go
new file mode 100644
index 0000000..6e6ee7e
--- /dev/null
+++ b/internal/doctor/doctor.go
@@ -0,0 +1,416 @@
+// Package doctor implements the checks behind `tracebloc cluster doctor`:
+// a read-only, best-effort health sweep of a running tracebloc client
+// cluster. Each check reports ✔/⚠/✖ plus a one-line remedy, so a customer
+// can answer "why isn't my experiment running?" without tracebloc shelling
+// into their cluster (epic tracebloc/client-runtime#116, WS3).
+//
+// Design mirrors the installer's preflight.sh: every check is independent
+// and returns a Result instead of aborting, so one failure never hides the
+// others. Network probes are injectable (Options) so the package is fully
+// exercisable with client-go's fake clientset — no real cluster or egress.
+package doctor
+
+import (
+	"context"
+	"fmt"
+	"net/http"
+	"sort"
+	"strings"
+	"time"
+
+	appsv1 "k8s.io/api/apps/v1"
+	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/client-go/kubernetes"
+
+	"github.com/tracebloc/cli/internal/cluster"
+)
+
+// Status is a single check's severity. Ordered so the numerically-greatest
+// status is the worst — see Worst.
+type Status int
+
+const (
+	StatusOK Status = iota
+	StatusWarn
+	StatusFail
+)
+
+func (s Status) String() string {
+	switch s {
+	case StatusOK:
+		return "ok"
+	case StatusWarn:
+		return "warn"
+	case StatusFail:
+		return "fail"
+	default:
+		return "unknown"
+	}
+}
+
+// Result is one check's outcome. Remedy is shown to the customer only when
+// Status is not OK.
+type Result struct {
+	Name   string
+	Status Status
+	Detail string
+	Remedy string
+}
+
+// Worst returns the most severe status across results (StatusOK for none).
+// The doctor command maps a non-OK worst to a non-zero exit code.
+func Worst(results []Result) Status {
+	worst := StatusOK
+	for _, r := range results {
+		if r.Status > worst {
+			worst = r.Status
+		}
+	}
+	return worst
+}
+
+// Conservative tunables, kept as package consts to avoid false positives on a
+// busy cluster. If a check ever needs runtime tuning, thread it through Options
+// (like HTTPProbe) rather than making these package vars.
+const (
+	pendingGrace     = 5 * time.Minute // a pod Pending longer than this is flagged
+	httpProbeTimeout = 8 * time.Second
+)
+
+// Options configures a diagnosis run. The zero value is usable: Namespace
+// defaults are the caller's concern (it passes the resolved namespace), and
+// HTTPProbe falls back to the real proxy-aware prober.
+type Options struct {
+	Namespace string
+
+	// HTTPProbe reports whether a URL is reachable from where the CLI runs.
+	// nil => httpProbe (proxy-aware, short timeout). Injected in tests.
+	HTTPProbe func(ctx context.Context, url string) error
+}
+
+// Run executes every check in display order and returns their results. It
+// never returns an error: an unreachable cluster or a failing probe is a
+// Result, not a Go error — the command renders all of them and derives the
+// exit code from Worst.
+func Run(ctx context.Context, cs kubernetes.Interface, opts Options) []Result {
+	if opts.HTTPProbe == nil {
+		opts.HTTPProbe = httpProbe
+	}
+	ns := opts.Namespace
+
+	// Discovered once and shared: the parent release gates nothing (every
+	// check still runs and reports), but the later checks reuse it.
+	release, relErr := cluster.DiscoverParentRelease(ctx, cs, ns)
+	jmEnv := jobsManagerEnv(ctx, cs, ns, release)
+
+	return []Result{
+		checkReachable(release, relErr, ns),
+		checkPods(ctx, cs, ns),
+		checkPVC(ctx, cs, ns),
+		checkProxy(jmEnv),
+		checkBackendEgress(ctx, jmEnv, opts.HTTPProbe),
+		checkRequestsProxy(ctx, cs, ns, release),
+	}
+}
+
+// checkReachable confirms the API answered and the parent client chart is
+// installed here. It's the gate the customer reads first; the rest still run.
+func checkReachable(release *cluster.ParentRelease, err error, ns string) Result {
+	const name = "Cluster reachable"
+	if err != nil {
+		return Result{
+			Name:   name,
+			Status: StatusFail,
+			Detail: err.Error(),
+			Remedy: "Check your kubeconfig/context and that the tracebloc client chart is installed here: kubectl get deploy -n " + ns,
+		}
+	}
+	return Result{
+		Name:   name,
+		Status: StatusOK,
+		Detail: fmt.Sprintf("release %q, chart %s, appVersion %s (namespace %s)",
+			release.ReleaseName, release.ChartVersion, release.AppVersion, ns),
+	}
+}
+
+// checkPods flags crash-looping or long-Pending pods — the local complement
+// to the controller's crash-loop detection (client-runtime#117). Conservative
+// thresholds keep a transient restart or a briefly-Pending job from tripping it.
+func checkPods(ctx context.Context, cs kubernetes.Interface, ns string) Result {
+	const name = "Pod health"
+	pods, err := cs.CoreV1().Pods(ns).List(ctx, metav1.ListOptions{})
+	if err != nil {
+		return Result{
+			Name:   name,
+			Status: StatusWarn,
+			Detail: "could not list pods: " + err.Error(),
+			Remedy: "Ensure your kubeconfig user can list pods in " + ns + ".",
+		}
+	}
+
+	var crashing, pending []string
+	for _, p := range pods.Items {
+		if podCrashLooping(p) {
+			crashing = append(crashing, p.Name)
+			continue
+		}
+		if p.Status.Phase == corev1.PodPending &&
+			time.Since(p.CreationTimestamp.Time) > pendingGrace {
+			pending = append(pending, p.Name)
+		}
+	}
+	sort.Strings(crashing)
+	sort.Strings(pending)
+
+	switch {
+	case len(crashing) > 0:
+		return Result{
+			Name:   name,
+			Status: StatusFail,
+			Detail: fmt.Sprintf("crash-looping: %v", crashing),
+			Remedy: "Inspect the container's own logs: kubectl logs -n " + ns + " <pod> --previous",
+		}
+	case len(pending) > 0:
+		return Result{
+			Name:   name,
+			Status: StatusWarn,
+			Detail: fmt.Sprintf("Pending > %s: %v", pendingGrace, pending),
+			Remedy: "kubectl describe pod -n " + ns + " <pod> — usually unschedulable (resources) or an image pull issue.",
+		}
+	default:
+		return Result{Name: name, Status: StatusOK, Detail: fmt.Sprintf("%d pod(s), none crash-looping or stuck Pending", len(pods.Items))}
+	}
+}
+
+// podCrashLooping reports whether a pod has a container actively stuck in
+// CrashLoopBackOff — the state Kubernetes sets for a container that keeps
+// crashing. Both init AND app containers are checked: an init container in
+// CrashLoopBackOff keeps the pod Pending and blocks startup silently (Bugbot
+// on PR #89).
+//
+// We deliberately do NOT infer crash-looping from RestartCount: a high count
+// is equally produced by a pod that recovered on retry, a job that retried
+// before Succeeding, or a completed init container — all healthy (Bugbot on
+// PR #89; cf. the controller's recovered-container fix, client-runtime#117).
+// The terminal-phase guard is belt-and-suspenders: a Succeeded/Failed pod has
+// no waiting containers anyway.
+func podCrashLooping(p corev1.Pod) bool {
+	if p.Status.Phase == corev1.PodSucceeded || p.Status.Phase == corev1.PodFailed {
+		return false
+	}
+	for _, group := range [][]corev1.ContainerStatus{p.Status.InitContainerStatuses, p.Status.ContainerStatuses} {
+		for _, c := range group {
+			if c.State.Waiting != nil && c.State.Waiting.Reason == "CrashLoopBackOff" {
+				return true
+			}
+		}
+	}
+	return false
+}
+
+// checkPVC reuses cluster.DiscoverSharedPVC — which already verifies the
+// shared-data PVC exists and is Bound, with actionable errors.
+func checkPVC(ctx context.Context, cs kubernetes.Interface, ns string) Result {
+	const name = "Dataset volume (PVC)"
+	pvc, err := cluster.DiscoverSharedPVC(ctx, cs, ns)
+	if err != nil {
+		return Result{
+			Name:   name,
+			Status: StatusFail,
+			Detail: err.Error(),
+			Remedy: "Check the cluster has a usable StorageClass (kubectl get sc) and the PVC is Bound (kubectl get pvc -n " + ns + ").",
+		}
+	}
+	return Result{
+		Name:   name,
+		Status: StatusOK,
+		Detail: fmt.Sprintf("%s Bound, mounted at %s", pvc.ClaimName, pvc.MountPath),
+	}
+}
+
+// checkProxy surfaces the in-cluster proxy wiring read from jobs-manager's
+// env — the corporate-proxy propagation that, when missing, silently strands
+// egress. Informational: present config is ✔; a missing requests-proxy URL is
+// the only genuine anomaly.
+func checkProxy(env map[string]string) Result {
+	const name = "Proxy configuration"
+	rp := env["REQUESTS_PROXY_URL"]
+	if rp == "" {
+		return Result{
+			Name:   name,
+			Status: StatusWarn,
+			// jobsManagerEnv reads only literal env values, so a chart that sets
+			// REQUESTS_PROXY_URL via a configMap/secret ref reads as empty here —
+			// called out in the remedy so a ref-based install isn't mistaken for
+			// missing wiring.
+			Detail: "jobs-manager has no literal REQUESTS_PROXY_URL (chart too old, or it's set via a configMap/secret ref)",
+			Remedy: "Verify the requests-proxy is wired: kubectl set env deploy/<release>-jobs-manager --list | grep PROXY",
+		}
+	}
+	detail := "requests-proxy=" + rp
+	if eg := env["EGRESS_PROXY_URL"]; eg != "" {
+		detail += ", egress-proxy=" + eg
+	}
+	if env["HTTPS_PROXY"] != "" || env["HTTP_PROXY"] != "" {
+		detail += ", corporate HTTP(S)_PROXY set"
+	} else {
+		detail += ", no corporate HTTP(S)_PROXY"
+	}
+	return Result{Name: name, Status: StatusOK, Detail: detail}
+}
+
+// checkBackendEgress probes the tracebloc backend API. Honest scope: this
+// runs from the machine the CLI is on, NOT from inside the cluster — the
+// cluster egresses via its egress-proxy. An in-cluster probe is the WS3
+// follow-up; this still catches a customer network/proxy that can't reach
+// the backend at all.
+func checkBackendEgress(ctx context.Context, env map[string]string, probe func(context.Context, string) error) Result {
+	const name = "Backend egress (from this machine)"
+	host := backendHost(env["CLIENT_ENV"])
+	url := "https://" + host + "/"
+	if err := probe(ctx, url); err != nil {
+		return Result{
+			Name:   name,
+			Status: StatusFail,
+			Detail: fmt.Sprintf("%s unreachable: %v", host, err),
+			Remedy: "Check this machine's network/proxy to " + host + ". The cluster egresses via its egress-proxy, so this is indicative, not definitive.",
+		}
+	}
+	return Result{Name: name, Status: StatusOK, Detail: host + " reachable"}
+}
+
+// backendHost maps CLIENT_ENV to the backend API host, mirroring the edge
+// runtime's own mapping (controller.py). Unset/unknown defaults to prod, the
+// chart's CLIENT_ENV default.
+func backendHost(clientEnv string) string {
+	switch clientEnv {
+	case "dev":
+		return "dev-api.tracebloc.io"
+	case "stg":
+		return "stg-api.tracebloc.io"
+	default:
+		return "api.tracebloc.io"
+	}
+}
+
+// checkRequestsProxy verifies the requests-proxy deployment — the in-cluster
+// broker for experiment egress (the Service Bus "experiments" queue) — is
+// present and Ready. While it's down, experiments egress fails and the
+// experiment silently stays Pending, the exact class this epic targets.
+func checkRequestsProxy(ctx context.Context, cs kubernetes.Interface, ns string, release *cluster.ParentRelease) Result {
+	const name = "Service Bus egress (requests-proxy)"
+	dep := findDeployment(ctx, cs, ns, release, "requests-proxy")
+	if dep == nil {
+		return Result{
+			Name:   name,
+			Status: StatusFail,
+			Detail: "requests-proxy deployment not found",
+			Remedy: "The requests-proxy brokers experiment (Service Bus) egress; without it experiments stay Pending. Reinstall/upgrade the client chart.",
+		}
+	}
+	if dep.Status.ReadyReplicas < 1 {
+		return Result{
+			Name:   name,
+			Status: StatusFail,
+			Detail: fmt.Sprintf("requests-proxy not ready (%d/%d replicas)", dep.Status.ReadyReplicas, dep.Status.Replicas),
+			Remedy: "Experiments egress flows through requests-proxy; while it's down they stay Pending. kubectl describe deploy " + dep.Name + " -n " + ns,
+		}
+	}
+	return Result{Name: name, Status: StatusOK, Detail: "requests-proxy ready (brokers the 'experiments' queue)"}
+}
+
+// jobsManagerEnv reads jobs-manager's first-container plain env into a map
+// (valueFrom entries have no literal value and are skipped). Best-effort:
+// returns an empty map when the deployment can't be fetched.
+func jobsManagerEnv(ctx context.Context, cs kubernetes.Interface, ns string, release *cluster.ParentRelease) map[string]string {
+	env := map[string]string{}
+	dep := findDeployment(ctx, cs, ns, release, "jobs-manager")
+	if dep == nil || len(dep.Spec.Template.Spec.Containers) == 0 {
+		return env
+	}
+	for _, e := range dep.Spec.Template.Spec.Containers[0].Env {
+		if e.Value != "" {
+			env[e.Name] = e.Value
+		}
+	}
+	return env
+}
+
+// getDeployment returns the first of candidates that exists, or nil.
+func getDeployment(ctx context.Context, cs kubernetes.Interface, ns string, candidates []string) *appsv1.Deployment {
+	for _, n := range candidates {
+		if n == "" {
+			continue
+		}
+		d, err := cs.AppsV1().Deployments(ns).Get(ctx, n, metav1.GetOptions{})
+		if err == nil {
+			return d
+		}
+	}
+	return nil
+}
+
+// findDeployment locates a chart component's Deployment ("<suffix>", e.g.
+// "requests-proxy"), tied to the discovered release so a check can never be
+// satisfied by a DIFFERENT release's component or a stray bare one (Bugbot on
+// PR #89).
+//
+// Release known: take the chart's standard "<release>-<suffix>" name, or a bare
+// "<suffix>" ONLY when its app.kubernetes.io/instance label ties it to this
+// release (older unprefixed charts). A deployment belonging to another release
+// is never accepted — if this release's component is missing, return nil and let
+// the check report it.
+//
+// Release unknown (discovery failed): match by name suffix, but only when
+// EXACTLY ONE deployment carries it. With several (multiple releases, which
+// DiscoverParentRelease refuses to disambiguate) there's no safe attribution, so
+// return nil rather than guess — which would let different checks describe
+// different releases in one run.
+func findDeployment(ctx context.Context, cs kubernetes.Interface, ns string, release *cluster.ParentRelease, suffix string) *appsv1.Deployment {
+	if release != nil && release.ReleaseName != "" {
+		if d := getDeployment(ctx, cs, ns, []string{release.ReleaseName + "-" + suffix}); d != nil {
+			return d
+		}
+		if d := getDeployment(ctx, cs, ns, []string{suffix}); d != nil &&
+			d.Labels["app.kubernetes.io/instance"] == release.ReleaseName {
+			return d
+		}
+		return nil
+	}
+
+	deps, err := cs.AppsV1().Deployments(ns).List(ctx, metav1.ListOptions{})
+	if err != nil {
+		return nil
+	}
+	var match *appsv1.Deployment
+	for i := range deps.Items {
+		if n := deps.Items[i].Name; n == suffix || strings.HasSuffix(n, "-"+suffix) {
+			if match != nil {
+				return nil // ambiguous across releases — don't guess
+			}
+			match = &deps.Items[i]
+		}
+	}
+	return match
+}
+
+// httpProbe is the default backend prober: a GET with a short timeout over
+// Go's default transport, which honors HTTP(S)_PROXY/NO_PROXY via
+// http.ProxyFromEnvironment. Any HTTP response (even 4xx/5xx) means the host
+// is reachable — we're testing connectivity, not the endpoint's health.
+func httpProbe(ctx context.Context, url string) error {
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
+	if err != nil {
+		return err
+	}
+	client := &http.Client{Timeout: httpProbeTimeout}
+	resp, err := client.Do(req)
+	if err != nil {
+		return err
+	}
+	// Connected — the host is reachable regardless of status code. Discard the
+	// close error so a rare post-connect close failure isn't reported as "down".
+	_ = resp.Body.Close()
+	return nil
+}
diff --git a/internal/doctor/doctor_test.go b/internal/doctor/doctor_test.go
new file mode 100644
index 0000000..a47fd18
--- /dev/null
+++ b/internal/doctor/doctor_test.go
@@ -0,0 +1,341 @@
+package doctor
+
+import (
+	"context"
+	"errors"
+	"strings"
+	"testing"
+	"time"
+
+	appsv1 "k8s.io/api/apps/v1"
+	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/client-go/kubernetes/fake"
+
+	"github.com/tracebloc/cli/internal/cluster"
+)
+
+const ns = "tracebloc"
+
+func bg() context.Context { return context.Background() }
+
+// jobsManagerDep mirrors the chart labels DiscoverParentRelease keys off
+// (see internal/cluster/discover_test.go) so the fake clientset discovers it.
+func jobsManagerDep(release string, env ...corev1.EnvVar) *appsv1.Deployment {
+	return &appsv1.Deployment{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      release + "-jobs-manager",
+			Namespace: ns,
+			Labels: map[string]string{
+				"app.kubernetes.io/name":       "client",
+				"app.kubernetes.io/instance":   release,
+				"app.kubernetes.io/managed-by": "Helm",
+				"app.kubernetes.io/version":    "1.3.5",
+				"helm.sh/chart":                "client-1.3.5",
+			},
+		},
+		Spec: appsv1.DeploymentSpec{
+			Template: corev1.PodTemplateSpec{
+				Spec: corev1.PodSpec{
+					Containers: []corev1.Container{{Name: "api", Env: env}},
+				},
+			},
+		},
+	}
+}
+
+func requestsProxyDep(release string, ready int32) *appsv1.Deployment {
+	return &appsv1.Deployment{
+		ObjectMeta: metav1.ObjectMeta{Name: release + "-requests-proxy", Namespace: ns},
+		Status:     appsv1.DeploymentStatus{Replicas: 1, ReadyReplicas: ready},
+	}
+}
+
+func boundPVC() *corev1.PersistentVolumeClaim {
+	return &corev1.PersistentVolumeClaim{
+		ObjectMeta: metav1.ObjectMeta{Name: cluster.SharedPVCClaimName, Namespace: ns},
+		Spec:       corev1.PersistentVolumeClaimSpec{AccessModes: []corev1.PersistentVolumeAccessMode{corev1.ReadWriteOnce}},
+		Status:     corev1.PersistentVolumeClaimStatus{Phase: corev1.ClaimBound},
+	}
+}
+
+func runningPod(name string) *corev1.Pod {
+	return &corev1.Pod{
+		ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: ns},
+		Status: corev1.PodStatus{
+			Phase:             corev1.PodRunning,
+			ContainerStatuses: []corev1.ContainerStatus{{Name: "c", RestartCount: 0}},
+		},
+	}
+}
+
+func crashPod(name string) *corev1.Pod {
+	return &corev1.Pod{
+		ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: ns},
+		Status: corev1.PodStatus{
+			Phase: corev1.PodRunning,
+			ContainerStatuses: []corev1.ContainerStatus{{
+				Name:  "c",
+				State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{Reason: "CrashLoopBackOff"}},
+			}},
+		},
+	}
+}
+
+// succeededPod is a finished job pod that retried before completing — a high
+// RestartCount here is historical, not a current crash-loop (Bugbot on #89).
+func succeededPod(name string, restarts int32) *corev1.Pod {
+	return &corev1.Pod{
+		ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: ns},
+		Status: corev1.PodStatus{
+			Phase:             corev1.PodSucceeded,
+			ContainerStatuses: []corev1.ContainerStatus{{Name: "c", RestartCount: restarts}},
+		},
+	}
+}
+
+// recoveredPod restarted several times but its container is running again now —
+// recovered, not crash-looping (cf. controller recovered-container fix, #117).
+func recoveredPod(name string, restarts int32) *corev1.Pod {
+	return &corev1.Pod{
+		ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: ns},
+		Status: corev1.PodStatus{
+			Phase: corev1.PodRunning,
+			ContainerStatuses: []corev1.ContainerStatus{{
+				Name:         "c",
+				RestartCount: restarts,
+				State:        corev1.ContainerState{Running: &corev1.ContainerStateRunning{}},
+			}},
+		},
+	}
+}
+
+func pendingPod(name string, age time.Duration) *corev1.Pod {
+	return &corev1.Pod{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:              name,
+			Namespace:         ns,
+			CreationTimestamp: metav1.NewTime(time.Now().Add(age)),
+		},
+		Status: corev1.PodStatus{Phase: corev1.PodPending},
+	}
+}
+
+// initCrashPod has an init container stuck in CrashLoopBackOff — the pod stays
+// Pending and cannot start, so it must read as a failure, not a Pending warning
+// (Bugbot on #89).
+func initCrashPod(name string) *corev1.Pod {
+	return &corev1.Pod{
+		ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: ns},
+		Status: corev1.PodStatus{
+			Phase: corev1.PodPending,
+			InitContainerStatuses: []corev1.ContainerStatus{{
+				Name:  "init",
+				State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{Reason: "CrashLoopBackOff"}},
+			}},
+		},
+	}
+}
+
+func TestWorst(t *testing.T) {
+	if got := Worst(nil); got != StatusOK {
+		t.Fatalf("Worst(nil) = %v, want ok", got)
+	}
+	rs := []Result{{Status: StatusOK}, {Status: StatusFail}, {Status: StatusWarn}}
+	if got := Worst(rs); got != StatusFail {
+		t.Fatalf("Worst = %v, want fail", got)
+	}
+}
+
+func TestCheckReachable(t *testing.T) {
+	if r := checkReachable(nil, errors.New("boom"), ns); r.Status != StatusFail {
+		t.Fatalf("error => %v, want fail", r.Status)
+	}
+	rel := &cluster.ParentRelease{ReleaseName: "tb", ChartVersion: "1.3.5", AppVersion: "1.3.5"}
+	r := checkReachable(rel, nil, ns)
+	if r.Status != StatusOK || !strings.Contains(r.Detail, "tb") {
+		t.Fatalf("release => %v / %q, want ok mentioning the release", r.Status, r.Detail)
+	}
+}
+
+func TestCheckPods(t *testing.T) {
+	tests := []struct {
+		name string
+		pod  *corev1.Pod
+		want Status
+	}{
+		{"healthy", runningPod("ok"), StatusOK},
+		{"crash-loop", crashPod("bad"), StatusFail},
+		{"pending-old", pendingPod("stuck", -10*time.Minute), StatusWarn},
+		{"pending-fresh", pendingPod("fresh", -time.Minute), StatusOK},
+		{"succeeded-high-restarts", succeededPod("done", 5), StatusOK},
+		{"recovered-running", recoveredPod("recovered", 5), StatusOK},
+		{"init-crash-loop", initCrashPod("initbad"), StatusFail},
+	}
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			cs := fake.NewClientset(tc.pod)
+			if r := checkPods(bg(), cs, ns); r.Status != tc.want {
+				t.Fatalf("checkPods = %v (%q), want %v", r.Status, r.Detail, tc.want)
+			}
+		})
+	}
+}
+
+func TestCheckPVC(t *testing.T) {
+	if r := checkPVC(bg(), fake.NewClientset(boundPVC()), ns); r.Status != StatusOK {
+		t.Fatalf("bound PVC => %v, want ok", r.Status)
+	}
+	if r := checkPVC(bg(), fake.NewClientset(), ns); r.Status != StatusFail {
+		t.Fatalf("missing PVC => %v, want fail", r.Status)
+	}
+}
+
+func TestCheckProxy(t *testing.T) {
+	tests := []struct {
+		name   string
+		env    map[string]string
+		want   Status
+		substr string
+	}{
+		{"requests-proxy set", map[string]string{"REQUESTS_PROXY_URL": "http://requests-proxy-service:8888"}, StatusOK, "requests-proxy="},
+		{"corporate proxy", map[string]string{"REQUESTS_PROXY_URL": "http://x", "HTTPS_PROXY": "http://corp:3128"}, StatusOK, "corporate HTTP(S)_PROXY set"},
+		{"empty", map[string]string{}, StatusWarn, "REQUESTS_PROXY_URL"},
+	}
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			r := checkProxy(tc.env)
+			if r.Status != tc.want || !strings.Contains(r.Detail, tc.substr) {
+				t.Fatalf("checkProxy = %v / %q, want %v containing %q", r.Status, r.Detail, tc.want, tc.substr)
+			}
+		})
+	}
+}
+
+func TestCheckBackendEgress(t *testing.T) {
+	okProbe := func(context.Context, string) error { return nil }
+	failProbe := func(context.Context, string) error { return errors.New("dns failure") }
+
+	if r := checkBackendEgress(bg(), map[string]string{"CLIENT_ENV": "dev"}, okProbe); r.Status != StatusOK || !strings.Contains(r.Detail, "dev-api.tracebloc.io") {
+		t.Fatalf("reachable dev => %v / %q", r.Status, r.Detail)
+	}
+	if r := checkBackendEgress(bg(), map[string]string{}, failProbe); r.Status != StatusFail || !strings.Contains(r.Detail, "api.tracebloc.io") {
+		t.Fatalf("unreachable default => %v / %q", r.Status, r.Detail)
+	}
+}
+
+func TestBackendHost(t *testing.T) {
+	tests := map[string]string{
+		"dev":   "dev-api.tracebloc.io",
+		"stg":   "stg-api.tracebloc.io",
+		"prod":  "api.tracebloc.io",
+		"":      "api.tracebloc.io",
+		"weird": "api.tracebloc.io",
+	}
+	for in, want := range tests {
+		if got := backendHost(in); got != want {
+			t.Errorf("backendHost(%q) = %q, want %q", in, got, want)
+		}
+	}
+}
+
+func TestCheckRequestsProxy(t *testing.T) {
+	rel := &cluster.ParentRelease{ReleaseName: "tb"}
+	tests := []struct {
+		name string
+		dep  *appsv1.Deployment // nil => deployment absent
+		want Status
+	}{
+		{"ready", requestsProxyDep("tb", 1), StatusOK},
+		{"not-ready", requestsProxyDep("tb", 0), StatusFail},
+		{"missing", nil, StatusFail},
+	}
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			cs := fake.NewClientset()
+			if tc.dep != nil {
+				cs = fake.NewClientset(tc.dep)
+			}
+			if r := checkRequestsProxy(bg(), cs, ns, rel); r.Status != tc.want {
+				t.Fatalf("checkRequestsProxy = %v (%q), want %v", r.Status, r.Detail, tc.want)
+			}
+		})
+	}
+}
+
+// When DiscoverParentRelease failed (release nil) but a release-prefixed
+// requests-proxy exists, the suffix fallback must still find it rather than
+// falsely report it missing (Bugbot on #89).
+func TestCheckRequestsProxy_NilReleaseFindsPrefixed(t *testing.T) {
+	cs := fake.NewClientset(requestsProxyDep("tb", 1)) // "tb-requests-proxy"
+	if r := checkRequestsProxy(bg(), cs, ns, nil); r.Status != StatusOK {
+		t.Fatalf("nil release with prefixed deploy => %v (%q), want ok", r.Status, r.Detail)
+	}
+}
+
+// With multiple parent releases in one namespace (the case DiscoverParentRelease
+// refuses) and no discovered release, the suffix fallback must NOT pick one
+// arbitrarily — guessing could let different checks describe different releases
+// in a single run (Bugbot on #89). It should report can't-determine, not OK.
+func TestCheckRequestsProxy_NilReleaseAmbiguous(t *testing.T) {
+	cs := fake.NewClientset(
+		requestsProxyDep("relA", 1), // "relA-requests-proxy"
+		requestsProxyDep("relB", 1), // "relB-requests-proxy"
+	)
+	if r := checkRequestsProxy(bg(), cs, ns, nil); r.Status == StatusOK {
+		t.Fatalf("ambiguous multi-release => %v (%q), want not-OK (no guessing)", r.Status, r.Detail)
+	}
+}
+
+// With a release discovered, the check must be tied to THAT release: another
+// release's requests-proxy must not be accepted as the discovered release's,
+// or relA goes green on relB's proxy while relA's is actually missing
+// (Bugbot on #89).
+func TestCheckRequestsProxy_DiscoveredReleaseIgnoresOtherReleases(t *testing.T) {
+	rel := &cluster.ParentRelease{ReleaseName: "relA"} // relA has no requests-proxy
+	cs := fake.NewClientset(requestsProxyDep("relB", 1))
+	if r := checkRequestsProxy(bg(), cs, ns, rel); r.Status == StatusOK {
+		t.Fatalf("relA proxy missing, relB present => %v (%q), want not-OK", r.Status, r.Detail)
+	}
+}
+
+// A bare (unprefixed) requests-proxy is accepted only when its instance label
+// ties it to the discovered release — covering older unprefixed charts.
+func TestCheckRequestsProxy_BareNameAcceptedWhenLabelledForRelease(t *testing.T) {
+	rel := &cluster.ParentRelease{ReleaseName: "relA"}
+	bare := requestsProxyDep("relA", 1)
+	bare.Name = "requests-proxy"
+	bare.Labels = map[string]string{"app.kubernetes.io/instance": "relA"}
+	cs := fake.NewClientset(bare)
+	if r := checkRequestsProxy(bg(), cs, ns, rel); r.Status != StatusOK {
+		t.Fatalf("bare requests-proxy labelled for relA => %v (%q), want ok", r.Status, r.Detail)
+	}
+}
+
+func TestRun_HealthyCluster(t *testing.T) {
+	const rel = "tb"
+	cs := fake.NewClientset(
+		jobsManagerDep(rel,
+			corev1.EnvVar{Name: "REQUESTS_PROXY_URL", Value: "http://requests-proxy-service:8888"},
+			corev1.EnvVar{Name: "CLIENT_ENV", Value: "dev"},
+		),
+		requestsProxyDep(rel, 1),
+		boundPVC(),
+		runningPod("tb-jobs-manager-abc"),
+	)
+
+	results := Run(bg(), cs, Options{
+		Namespace: ns,
+		HTTPProbe: func(context.Context, string) error { return nil },
+	})
+
+	if len(results) != 6 {
+		t.Fatalf("want 6 checks, got %d", len(results))
+	}
+	if w := Worst(results); w != StatusOK {
+		for _, r := range results {
+			t.Logf("%-32s %-4s %s", r.Name, r.Status, r.Detail)
+		}
+		t.Fatalf("healthy cluster worst = %v, want ok", w)
+	}
+}

From b85f52ead243f1216a3ffb8520f9c997adeffce8 Mon Sep 17 00:00:00 2001
From: lukasWuttke <54042461+LukasWodka@users.noreply.github.com>
Date: Fri, 19 Jun 2026 09:02:39 +0200
Subject: [PATCH 06/15] =?UTF-8?q?refactor(cli=20P1):=20CategorySpec=20regi?=
 =?UTF-8?q?stry=20=E2=80=94=20one=20source=20for=20category=20dispatch=20(?=
 =?UTF-8?q?#74)=20(#80)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The CLI enumerated task categories in four hand-maintained places that had
drifted: the `--category` help listed 5 of 9 (#74), the push accept-gate
hand-listed the supported set twice, the interactive picker kept its own
list, and internal/push/category.go held four separate family maps.

Consolidate into one CategorySpec registry (internal/push/category.go):
each category's family, label, regression-class flag, and CLI-support
status lives in one ordered table. The family predicates
(IsImage/IsTabular/IsText/IsRegressionClass), the `--category` help, the
gate's "Supported:" lists, and the interactive picker now all derive from
it, so the enumerations can't drift apart again. The help now lists all 9
supported categories.

Behaviour-preserving: the gate accepts/rejects exactly the same set
(IsCLISupported == the prior nine-category condition); only the help text
and the now-registry-derived error messages change. semantic_/
instance_segmentation stay known-but-unsupported, each with a per-category
UnsupportedNote.

Adds a registry parity + predicate-derivation test (the anti-drift guard).
First phase of the CLI ingestion consolidation epic (backend#828).

Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Co-authored-by: Shujaat Hasan <shujaat@tracebloc.io>
---
 internal/cli/dataset.go                 |  25 ++--
 internal/cli/interactive.go             |  20 +--
 internal/push/category.go               | 180 +++++++++++++++++-------
 internal/push/category_registry_test.go |  99 +++++++++++++
 4 files changed, 245 insertions(+), 79 deletions(-)
 create mode 100644 internal/push/category_registry_test.go

diff --git a/internal/cli/dataset.go b/internal/cli/dataset.go
index 1b63d92..8cb40ba 100644
--- a/internal/cli/dataset.go
+++ b/internal/cli/dataset.go
@@ -222,8 +222,7 @@ Exit codes:
 	cmd.Flags().StringVar(&table, "table", "",
 		"destination table name (MySQL identifier; matches /data/shared/<table>/ on the PVC)")
 	cmd.Flags().StringVar(&category, "category", "image_classification",
-		"task category: image_classification, tabular_classification, tabular_regression, "+
-			"time_series_forecasting, time_to_event_prediction")
+		"task category, one of: "+push.SupportedCategoriesList())
 	cmd.Flags().StringVar(&intent, "intent", "",
 		"intent: train|test")
 	cmd.Flags().StringVar(&labelColumn, "label-column", "",
@@ -422,24 +421,20 @@ contributors train against it without ever seeing the raw files.`))
 	case a.Spec.Category == "":
 		// Left empty by a caller; let the schema produce the canonical
 		// "category is required" error downstream.
-	case push.IsTabular(a.Spec.Category) || push.IsText(a.Spec.Category) ||
-		a.Spec.Category == "image_classification" ||
-		a.Spec.Category == "object_detection" ||
-		a.Spec.Category == "keypoint_detection":
+	case push.IsCLISupported(a.Spec.Category):
 		// supported
 	case push.IsImage(a.Spec.Category):
-		// semantic_segmentation / instance_segmentation
+		// A known image category dataset push doesn't implement yet
+		// (semantic_segmentation / instance_segmentation). The per-category
+		// reason + the supported list both come from the registry.
+		spec, _ := push.Lookup(a.Spec.Category)
 		return &exitError{code: 2, err: fmt.Errorf(
-			"category %q isn't supported by the CLI yet. semantic_segmentation is "+
-				"blocked on the ingestor's mask-sidecar support (data-ingestors#136), and "+
-				"instance_segmentation isn't implemented. Supported image categories: "+
-				"image_classification, object_detection, keypoint_detection.", a.Spec.Category)}
+			"category %q isn't supported by the CLI yet (%s). Supported categories: %s.",
+			a.Spec.Category, spec.UnsupportedNote, push.SupportedCategoriesList())}
 	default:
 		return &exitError{code: 2, err: fmt.Errorf(
-			"category %q isn't a recognized task category. Supported: image_classification, "+
-				"object_detection, keypoint_detection, text_classification, "+
-				"masked_language_modeling, tabular_classification, tabular_regression, "+
-				"time_series_forecasting, time_to_event_prediction.", a.Spec.Category)}
+			"category %q isn't a recognized task category. Supported categories: %s.",
+			a.Spec.Category, push.SupportedCategoriesList())}
 	}
 
 	// 3. Walk the local directory FIRST (local "fail fast"), dispatched
diff --git a/internal/cli/interactive.go b/internal/cli/interactive.go
index 3c567bf..bb6478d 100644
--- a/internal/cli/interactive.go
+++ b/internal/cli/interactive.go
@@ -16,20 +16,12 @@ import (
 )
 
 // promptCategories is the ordered list offered by the interactive
-// category picker — the categories `dataset push` supports today (the
-// same set runDatasetPush's category gate accepts). semantic_ /
-// instance_segmentation are omitted until they're implemented.
-var promptCategories = []string{
-	"image_classification",
-	"object_detection",
-	"keypoint_detection",
-	"text_classification",
-	"masked_language_modeling",
-	"tabular_classification",
-	"tabular_regression",
-	"time_series_forecasting",
-	"time_to_event_prediction",
-}
+// category picker. It derives from the push registry's CLI-supported
+// set — the exact categories runDatasetPush's gate accepts — so the
+// picker can't drift from what `dataset push` actually supports.
+// semantic_/instance_segmentation are excluded (CLISupported=false)
+// until they're implemented.
+var promptCategories = push.SupportedCategoryIDs()
 
 // prompter is the narrow seam over the interactive library. Production
 // uses surveyPrompter (a real terminal); tests inject a fake that
diff --git a/internal/push/category.go b/internal/push/category.go
index c20728a..9ea14c3 100644
--- a/internal/push/category.go
+++ b/internal/push/category.go
@@ -1,72 +1,152 @@
 package push
 
-// Category families. These mirror data-ingestors'
+import "strings"
+
+// CategorySpec is the single source of truth for one task category's
+// CLI-relevant rules. It mirrors data-ingestors'
 // tracebloc_ingestor/cli/conventions.py groupings so the CLI's
-// per-category behaviour (which flags are required, which local
-// layout to expect, which spec fields to emit) stays in lock-step
+// per-category behaviour (which local layout to expect, which spec
+// fields to emit, whether a label policy is needed) stays in lock-step
 // with what the ingestor actually resolves.
 //
-// Kept as a single source of truth here rather than scattered
-// string comparisons across spec.go / dataset.go.
-
-// imageCategories take a labels CSV + an images/ directory (plus,
-// for some, extra sidecar dirs handled in later increments).
-var imageCategories = map[string]bool{
-	"image_classification":  true,
-	"object_detection":      true,
-	"keypoint_detection":    true,
-	"semantic_segmentation": true,
-	"instance_segmentation": true,
+// Everything category-shaped derives from the registry below — the
+// family predicates, the `--category` help text, the interactive
+// picker, and the push accept-gate — so the enumerations can't drift
+// apart (they used to: the flag help listed 5 of 9, cli#74).
+type CategorySpec struct {
+	// ID is the canonical category identifier; it matches the
+	// ingest.v1 schema enum (vendored via scripts/sync-schema.sh).
+	ID string
+	// Family selects the local layout + staging shape.
+	Family Family
+	// Label is the human-friendly name shown in the interactive picker.
+	Label string
+	// RegressionClass marks categories that predict a numeric target and
+	// therefore need label.policy (object label form) so the raw target
+	// never ships to the central backend by default.
+	RegressionClass bool
+	// CLISupported reports whether `dataset push` implements the category
+	// today. semantic_/instance_segmentation are known (the schema
+	// defines them) but not yet pushable.
+	CLISupported bool
+	// UnsupportedNote explains why a known-but-unimplemented category
+	// isn't available yet; surfaced by the push gate. Empty when supported.
+	UnsupportedNote string
+}
+
+// Family groups categories by local layout.
+type Family int
+
+const (
+	// FamilyImage: a labels CSV + an images/ directory (plus, for some,
+	// extra sidecar dirs like annotations/ or masks/).
+	FamilyImage Family = iota
+	// FamilyTabular: a single CSV whose columns are described by a
+	// `schema` (column → SQL type) map. No sidecar files.
+	FamilyTabular
+	// FamilyText: a labels CSV + a directory of text files (texts/ for
+	// classification, sequences/ for masked language modeling).
+	FamilyText
+)
+
+// categoryRegistry is the ordered, authoritative list of every category
+// the ingest.v1 schema defines. Order is the display order for help text
+// and the interactive picker (CLI-supported first, in workflow order;
+// the not-yet-implemented ones last). Adding a category to the schema
+// means adding it here — the parity test pins the set.
+var categoryRegistry = []CategorySpec{
+	{ID: "image_classification", Family: FamilyImage, Label: "Image classification", CLISupported: true},
+	{ID: "object_detection", Family: FamilyImage, Label: "Object detection", CLISupported: true},
+	{ID: "keypoint_detection", Family: FamilyImage, Label: "Keypoint detection", CLISupported: true},
+	{ID: "text_classification", Family: FamilyText, Label: "Text classification", CLISupported: true},
+	{ID: "masked_language_modeling", Family: FamilyText, Label: "Masked language modeling", CLISupported: true},
+	{ID: "tabular_classification", Family: FamilyTabular, Label: "Tabular classification", CLISupported: true},
+	{ID: "tabular_regression", Family: FamilyTabular, Label: "Tabular regression", RegressionClass: true, CLISupported: true},
+	{ID: "time_series_forecasting", Family: FamilyTabular, Label: "Time-series forecasting", RegressionClass: true, CLISupported: true},
+	{ID: "time_to_event_prediction", Family: FamilyTabular, Label: "Time-to-event prediction", RegressionClass: true, CLISupported: true},
+	{ID: "semantic_segmentation", Family: FamilyImage, Label: "Semantic segmentation", CLISupported: false,
+		UnsupportedNote: "blocked on the ingestor's mask-sidecar support (data-ingestors#136)"},
+	{ID: "instance_segmentation", Family: FamilyImage, Label: "Instance segmentation", CLISupported: false,
+		UnsupportedNote: "not implemented"},
+}
+
+// categoryByID indexes the registry for O(1) lookup, built once.
+var categoryByID = func() map[string]CategorySpec {
+	m := make(map[string]CategorySpec, len(categoryRegistry))
+	for _, c := range categoryRegistry {
+		m[c.ID] = c
+	}
+	return m
+}()
+
+// Lookup returns the spec for a category id and whether it is known.
+func Lookup(category string) (CategorySpec, bool) {
+	c, ok := categoryByID[category]
+	return c, ok
+}
+
+// IsKnown reports whether category is a recognized task category (in the
+// schema), supported by the CLI or not.
+func IsKnown(category string) bool {
+	_, ok := categoryByID[category]
+	return ok
 }
 
-// tabularCategories take a single CSV whose columns are described by
-// a `schema` (column → SQL type) map. No sidecar files.
-var tabularCategories = map[string]bool{
-	"tabular_classification":   true,
-	"tabular_regression":       true,
-	"time_series_forecasting":  true,
-	"time_to_event_prediction": true,
+// IsCLISupported reports whether `dataset push` implements category today.
+func IsCLISupported(category string) bool { return categoryByID[category].CLISupported }
+
+// IsImage reports whether category uses the labels.csv + images/ layout.
+func IsImage(category string) bool {
+	c, ok := categoryByID[category]
+	return ok && c.Family == FamilyImage
 }
 
-// regressionClassCategories predict a numeric target rather than a
-// class. The schema requires the label in object form with an
-// explicit `policy` so the raw target never ships to the central
-// backend by default (policy=bucket bins it first).
-var regressionClassCategories = map[string]bool{
-	"tabular_regression":       true,
-	"time_series_forecasting":  true,
-	"time_to_event_prediction": true,
+// IsTabular reports whether category uses the single-CSV + schema layout.
+func IsTabular(category string) bool {
+	c, ok := categoryByID[category]
+	return ok && c.Family == FamilyTabular
 }
 
-// textCategories take a labels CSV + a directory of text files
-// (texts/ for classification, sequences/ for masked language
-// modeling). masked_language_modeling additionally needs a
-// tokenizer.json at the dataset root and has NO label.
-var textCategories = map[string]bool{
-	"text_classification":      true,
-	"masked_language_modeling": true,
+// IsText reports whether category uses the labels.csv + text-file dir layout.
+func IsText(category string) bool {
+	c, ok := categoryByID[category]
+	return ok && c.Family == FamilyText
 }
 
-// IsImage reports whether category uses the labels.csv + images/
-// local layout.
-func IsImage(category string) bool { return imageCategories[category] }
+// IsRegressionClass reports whether category predicts a numeric target and
+// therefore needs label.policy (object label form).
+func IsRegressionClass(category string) bool { return categoryByID[category].RegressionClass }
 
-// IsTabular reports whether category uses the single-CSV + schema
-// local layout (no sidecar files).
-func IsTabular(category string) bool { return tabularCategories[category] }
+// SupportedCategoryIDs returns the ids `dataset push` supports, in display
+// order. Used to build the --category help, the interactive picker, and
+// the accept-gate's "Supported:" lists from one place.
+func SupportedCategoryIDs() []string {
+	ids := make([]string, 0, len(categoryRegistry))
+	for _, c := range categoryRegistry {
+		if c.CLISupported {
+			ids = append(ids, c.ID)
+		}
+	}
+	return ids
+}
 
-// IsRegressionClass reports whether category predicts a numeric
-// target and therefore needs label.policy (object label form).
-func IsRegressionClass(category string) bool { return regressionClassCategories[category] }
+// AllCategoryIDs returns every recognized category id, in registry order.
+func AllCategoryIDs() []string {
+	ids := make([]string, 0, len(categoryRegistry))
+	for _, c := range categoryRegistry {
+		ids = append(ids, c.ID)
+	}
+	return ids
+}
 
-// IsText reports whether category uses the labels.csv + text-file
-// directory (texts/ or sequences/) local layout.
-func IsText(category string) bool { return textCategories[category] }
+// SupportedCategoriesList is the comma-joined supported ids, for help text
+// and gate error messages.
+func SupportedCategoriesList() string { return strings.Join(SupportedCategoryIDs(), ", ") }
 
 // TextSidecarDir returns the sidecar directory name a text category
 // expects: "sequences" for masked_language_modeling, "texts" for
-// text_classification. (Used both as the local subdir to stage and
-// the spec field to emit.)
+// text_classification. (Used both as the local subdir to stage and the
+// spec field to emit.)
 func TextSidecarDir(category string) string {
 	if category == "masked_language_modeling" {
 		return "sequences"
diff --git a/internal/push/category_registry_test.go b/internal/push/category_registry_test.go
new file mode 100644
index 0000000..1bb40ea
--- /dev/null
+++ b/internal/push/category_registry_test.go
@@ -0,0 +1,99 @@
+package push
+
+import (
+	"sort"
+	"testing"
+)
+
+// The registry is the single source of truth; these pin its contents and
+// that the family predicates + the supported set all derive from it, so a
+// future edit can't reintroduce the "5 of 9" drift (cli#74).
+
+func TestRegistryKnownCategories(t *testing.T) {
+	want := []string{
+		"image_classification", "object_detection", "keypoint_detection",
+		"semantic_segmentation", "instance_segmentation",
+		"text_classification", "masked_language_modeling",
+		"tabular_classification", "tabular_regression",
+		"time_series_forecasting", "time_to_event_prediction",
+	}
+	if got := AllCategoryIDs(); !equalSet(got, want) {
+		t.Fatalf("AllCategoryIDs() = %v, want set %v", got, want)
+	}
+	for _, id := range want {
+		if !IsKnown(id) {
+			t.Errorf("IsKnown(%q) = false, want true", id)
+		}
+	}
+	if IsKnown("not_a_category") {
+		t.Error(`IsKnown("not_a_category") = true, want false`)
+	}
+}
+
+func TestSupportedCategories(t *testing.T) {
+	got := SupportedCategoryIDs()
+	if len(got) != 9 {
+		t.Fatalf("SupportedCategoryIDs() len = %d, want 9: %v", len(got), got)
+	}
+	for _, id := range got {
+		if !IsCLISupported(id) {
+			t.Errorf("SupportedCategoryIDs returned %q but IsCLISupported is false", id)
+		}
+	}
+	// semantic_/instance_segmentation are known but not yet pushable, and
+	// must explain why.
+	for _, id := range []string{"semantic_segmentation", "instance_segmentation"} {
+		if !IsKnown(id) {
+			t.Errorf("%s should be known", id)
+		}
+		if IsCLISupported(id) {
+			t.Errorf("%s should not be CLI-supported yet", id)
+		}
+		if spec, _ := Lookup(id); spec.UnsupportedNote == "" {
+			t.Errorf("%s should carry an UnsupportedNote", id)
+		}
+	}
+}
+
+func TestPredicatesDeriveFromRegistry(t *testing.T) {
+	for _, c := range categoryRegistry {
+		switch c.Family {
+		case FamilyImage:
+			if !IsImage(c.ID) || IsTabular(c.ID) || IsText(c.ID) {
+				t.Errorf("%s: predicates disagree with FamilyImage", c.ID)
+			}
+		case FamilyTabular:
+			if !IsTabular(c.ID) || IsImage(c.ID) || IsText(c.ID) {
+				t.Errorf("%s: predicates disagree with FamilyTabular", c.ID)
+			}
+		case FamilyText:
+			if !IsText(c.ID) || IsImage(c.ID) || IsTabular(c.ID) {
+				t.Errorf("%s: predicates disagree with FamilyText", c.ID)
+			}
+		}
+		if IsRegressionClass(c.ID) != c.RegressionClass {
+			t.Errorf("%s: IsRegressionClass = %v, want %v", c.ID, IsRegressionClass(c.ID), c.RegressionClass)
+		}
+	}
+	// An unknown category: every predicate false (no panic on missing key).
+	if IsImage("nope") || IsTabular("nope") || IsText("nope") ||
+		IsRegressionClass("nope") || IsCLISupported("nope") {
+		t.Error("predicates should all be false for an unknown category")
+	}
+}
+
+func equalSet(a, b []string) bool {
+	if len(a) != len(b) {
+		return false
+	}
+	as := append([]string(nil), a...)
+	bs := append([]string(nil), b...)
+	sort.Strings(as)
+	sort.Strings(bs)
+	for i := range as {
+		if as[i] != bs[i] {
+			return false
+		}
+	}
+	return true
+}

From 393edc021188243198dd74ab5a9192f5c5045311 Mon Sep 17 00:00:00 2001
From: "Asad Iqbal (Saadi)" <asad.dsoft@gmail.com>
Date: Fri, 19 Jun 2026 12:08:18 +0500
Subject: [PATCH 07/15] =?UTF-8?q?feat(#90):=20cluster=20doctor=20=E2=80=94?=
 =?UTF-8?q?=20node-fit=20+=20image-pull=20checks=20(WS3=20follow-up)=20(#9?=
 =?UTF-8?q?1)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* feat(#90): cluster doctor — node-fit + image-pull checks (WS3 follow-up)

Two read-only checks added to `tracebloc cluster doctor` (follow-up to #89):

- Node capacity: parses the resource requests jobs-manager stamps on spawned
  training jobs (RESOURCE_REQUESTS / GPU_REQUESTS env) and checks at least one
  Ready node can fit them — the "Pending forever, no node big enough" class.
  GPU is soft: a hard ✖ only on cpu/mem, and a ⚠ when a GPU is requested but no
  node exposes it (jobs-manager has a GPU->CPU fallback).
- Image pull secret: when jobs-manager references a registry pull secret,
  verifies it exists and is a well-formed dockerconfigjson so private-image
  pulls don't ImagePullBackOff.

Both read-only/best-effort, tested with client-go's fake clientset. The
in-cluster egress probe (the third deferred check on #90) is intentionally a
separate PR — it needs a port-forward/exec mechanism, not this read-only pattern.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

* fix(#90): node-fit must require cpu+mem+GPU on ONE node (Bugbot)

checkNodeFit set cpuMemFits and gpuFits independently, so they could come from
different nodes — reporting OK even when no single node had cpu+memory+GPU
together (a GPU job would then stay Pending). It now evaluates each node as a
whole: cpuMemFits (any node) drives the hard fail; fullFits (one node with
cpu+mem AND the GPU) drives the ok/warn split. Adds regression tests for the
cross-node and single-node-fits cases.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 internal/doctor/doctor.go      | 183 +++++++++++++++++++++++++++++++++
 internal/doctor/doctor_test.go | 159 +++++++++++++++++++++++++++-
 2 files changed, 340 insertions(+), 2 deletions(-)

diff --git a/internal/doctor/doctor.go b/internal/doctor/doctor.go
index 6e6ee7e..fdcc4ec 100644
--- a/internal/doctor/doctor.go
+++ b/internal/doctor/doctor.go
@@ -12,6 +12,7 @@ package doctor
 
 import (
 	"context"
+	"encoding/json"
 	"fmt"
 	"net/http"
 	"sort"
@@ -20,6 +21,7 @@ import (
 
 	appsv1 "k8s.io/api/apps/v1"
 	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/client-go/kubernetes"
 
@@ -108,6 +110,8 @@ func Run(ctx context.Context, cs kubernetes.Interface, opts Options) []Result {
 		checkReachable(release, relErr, ns),
 		checkPods(ctx, cs, ns),
 		checkPVC(ctx, cs, ns),
+		checkNodeFit(ctx, cs, jmEnv),
+		checkImagePull(ctx, cs, ns, release),
 		checkProxy(jmEnv),
 		checkBackendEgress(ctx, jmEnv, opts.HTTPProbe),
 		checkRequestsProxy(ctx, cs, ns, release),
@@ -320,6 +324,185 @@ func checkRequestsProxy(ctx context.Context, cs kubernetes.Interface, ns string,
 	return Result{Name: name, Status: StatusOK, Detail: "requests-proxy ready (brokers the 'experiments' queue)"}
 }
 
+// checkNodeFit verifies at least one Ready node can satisfy the resource
+// requests the jobs-manager stamps on spawned training jobs (RESOURCE_REQUESTS
+// / GPU_REQUESTS env) — the "Pending forever, no node big enough" class. GPU is
+// soft: when a GPU is requested but no node exposes it, that's a ⚠ (jobs-manager
+// has a GPU→CPU fallback), not a hard failure.
+func checkNodeFit(ctx context.Context, cs kubernetes.Interface, env map[string]string) Result {
+	const name = "Node capacity"
+	cpuReq, memReq, ok := parseCPUMem(env["RESOURCE_REQUESTS"])
+	if !ok {
+		return Result{
+			Name:   name,
+			Status: StatusWarn,
+			Detail: "couldn't read RESOURCE_REQUESTS from jobs-manager — skipping node-fit",
+			Remedy: "kubectl set env deploy/<release>-jobs-manager --list | grep RESOURCE_REQUESTS",
+		}
+	}
+	gpuName, gpuReq, gpuRequested := parseGPU(env["GPU_REQUESTS"])
+
+	nodes, err := cs.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
+	if err != nil {
+		return Result{
+			Name:   name,
+			Status: StatusWarn,
+			Detail: "could not list nodes: " + err.Error(),
+			Remedy: "Ensure your kubeconfig user can list nodes.",
+		}
+	}
+
+	req := fmt.Sprintf("cpu=%s, memory=%s", cpuReq.String(), memReq.String())
+	if gpuRequested {
+		req += fmt.Sprintf(", %s=%s", gpuName, gpuReq.String())
+	}
+
+	// A pod gets ALL its requested resources from ONE node, so evaluate each
+	// node as a whole — never OR cpu/mem and GPU across different nodes, which
+	// would pass even when no single node can run the job (Bugbot on PR #91).
+	var cpuMemFits, fullFits bool
+	for i := range nodes.Items {
+		n := nodes.Items[i]
+		if !nodeReady(n) {
+			continue
+		}
+		alloc := n.Status.Allocatable
+		nodeCPUMem := alloc.Cpu().Cmp(cpuReq) >= 0 && alloc.Memory().Cmp(memReq) >= 0
+		nodeGPU := !gpuRequested
+		if gpuRequested {
+			if q, present := alloc[gpuName]; present && q.Cmp(gpuReq) >= 0 {
+				nodeGPU = true
+			}
+		}
+		if nodeCPUMem {
+			cpuMemFits = true
+		}
+		if nodeCPUMem && nodeGPU {
+			fullFits = true
+		}
+	}
+
+	switch {
+	case !cpuMemFits:
+		return Result{
+			Name:   name,
+			Status: StatusFail,
+			Detail: fmt.Sprintf("no Ready node can fit a training job (needs %s)", req),
+			Remedy: "Add/resize a node to meet the job's requests, or lower RESOURCE_REQUESTS on jobs-manager.",
+		}
+	case gpuRequested && !fullFits:
+		return Result{
+			Name:   name,
+			Status: StatusWarn,
+			Detail: fmt.Sprintf("no single Ready node satisfies cpu+memory AND %s — GPU jobs rely on the CPU fallback (needs %s)", gpuName, req),
+			Remedy: "If GPU training is expected, ensure one node has both the compute and the GPU capacity, with its device plugin.",
+		}
+	default:
+		return Result{Name: name, Status: StatusOK, Detail: fmt.Sprintf("a Ready node can schedule a training job (%s)", req)}
+	}
+}
+
+// checkImagePull verifies that any registry pull secret the jobs-manager
+// references exists and is a well-formed dockerconfigjson — so private-image
+// pulls don't ImagePullBackOff. (Bad-but-well-formed credentials can't be
+// detected without an actual pull; this catches a missing/empty/malformed
+// secret, the common misconfig.)
+func checkImagePull(ctx context.Context, cs kubernetes.Interface, ns string, release *cluster.ParentRelease) Result {
+	const name = "Image pull secret"
+	dep := findDeployment(ctx, cs, ns, release, "jobs-manager")
+	if dep == nil {
+		return Result{
+			Name:   name,
+			Status: StatusWarn,
+			Detail: "couldn't read jobs-manager to resolve image pull secrets — skipping",
+			Remedy: "Check the parent client release is installed in " + ns + ".",
+		}
+	}
+	secrets := dep.Spec.Template.Spec.ImagePullSecrets
+	if len(secrets) == 0 {
+		return Result{Name: name, Status: StatusOK, Detail: "no image pull secret in use (public/digest-pinned images)"}
+	}
+	for _, ref := range secrets {
+		sec, err := cs.CoreV1().Secrets(ns).Get(ctx, ref.Name, metav1.GetOptions{})
+		if err != nil {
+			return Result{
+				Name:   name,
+				Status: StatusFail,
+				Detail: fmt.Sprintf("image pull secret %q not found", ref.Name),
+				Remedy: "Private image pulls will ImagePullBackOff. Reinstall the chart with valid registry credentials.",
+			}
+		}
+		if sec.Type != corev1.SecretTypeDockerConfigJson {
+			return Result{
+				Name:   name,
+				Status: StatusFail,
+				Detail: fmt.Sprintf("secret %q is type %q, not %s", ref.Name, sec.Type, corev1.SecretTypeDockerConfigJson),
+				Remedy: "Recreate it as a docker-registry secret (kubectl create secret docker-registry).",
+			}
+		}
+		if data := sec.Data[corev1.DockerConfigJsonKey]; len(data) == 0 || !json.Valid(data) {
+			return Result{
+				Name:   name,
+				Status: StatusFail,
+				Detail: fmt.Sprintf("secret %q has an empty or malformed %s", ref.Name, corev1.DockerConfigJsonKey),
+				Remedy: "Recreate the registry secret; its .dockerconfigjson isn't valid JSON.",
+			}
+		}
+	}
+	return Result{Name: name, Status: StatusOK, Detail: fmt.Sprintf("%d image pull secret(s) present and well-formed", len(secrets))}
+}
+
+// parseResourceSpec parses jobs-manager's "k1=v1,k2=v2" resource env into a map.
+func parseResourceSpec(spec string) map[string]string {
+	out := map[string]string{}
+	for _, part := range strings.Split(spec, ",") {
+		kv := strings.SplitN(strings.TrimSpace(part), "=", 2)
+		if len(kv) == 2 && strings.TrimSpace(kv[0]) != "" {
+			out[strings.TrimSpace(kv[0])] = strings.TrimSpace(kv[1])
+		}
+	}
+	return out
+}
+
+// parseCPUMem extracts the cpu + memory quantities from a RESOURCE_REQUESTS
+// spec; ok is false unless both are present and parseable.
+func parseCPUMem(spec string) (cpu, mem resource.Quantity, ok bool) {
+	m := parseResourceSpec(spec)
+	c, cOK := m["cpu"]
+	mm, mOK := m["memory"]
+	if !cOK || !mOK {
+		return resource.Quantity{}, resource.Quantity{}, false
+	}
+	cpu, errC := resource.ParseQuantity(c)
+	mem, errM := resource.ParseQuantity(mm)
+	if errC != nil || errM != nil {
+		return resource.Quantity{}, resource.Quantity{}, false
+	}
+	return cpu, mem, true
+}
+
+// parseGPU extracts the GPU resource name + quantity from a GPU_REQUESTS spec
+// (e.g. "nvidia.com/gpu=1"). requested is false when absent, unparseable, or 0.
+func parseGPU(spec string) (name corev1.ResourceName, qty resource.Quantity, requested bool) {
+	for k, v := range parseResourceSpec(spec) {
+		q, err := resource.ParseQuantity(v)
+		if err == nil && !q.IsZero() {
+			return corev1.ResourceName(k), q, true
+		}
+	}
+	return "", resource.Quantity{}, false
+}
+
+// nodeReady reports whether a node's Ready condition is True.
+func nodeReady(n corev1.Node) bool {
+	for _, c := range n.Status.Conditions {
+		if c.Type == corev1.NodeReady {
+			return c.Status == corev1.ConditionTrue
+		}
+	}
+	return false
+}
+
 // jobsManagerEnv reads jobs-manager's first-container plain env into a map
 // (valueFrom entries have no literal value and are skipped). Best-effort:
 // returns an empty map when the deployment can't be fetched.
diff --git a/internal/doctor/doctor_test.go b/internal/doctor/doctor_test.go
index a47fd18..766fe93 100644
--- a/internal/doctor/doctor_test.go
+++ b/internal/doctor/doctor_test.go
@@ -9,6 +9,7 @@ import (
 
 	appsv1 "k8s.io/api/apps/v1"
 	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/client-go/kubernetes/fake"
 
@@ -318,10 +319,12 @@ func TestRun_HealthyCluster(t *testing.T) {
 		jobsManagerDep(rel,
 			corev1.EnvVar{Name: "REQUESTS_PROXY_URL", Value: "http://requests-proxy-service:8888"},
 			corev1.EnvVar{Name: "CLIENT_ENV", Value: "dev"},
+			corev1.EnvVar{Name: "RESOURCE_REQUESTS", Value: "cpu=2,memory=8Gi"},
 		),
 		requestsProxyDep(rel, 1),
 		boundPVC(),
 		runningPod("tb-jobs-manager-abc"),
+		node("n1", "4", "16Gi"),
 	)
 
 	results := Run(bg(), cs, Options{
@@ -329,8 +332,8 @@ func TestRun_HealthyCluster(t *testing.T) {
 		HTTPProbe: func(context.Context, string) error { return nil },
 	})
 
-	if len(results) != 6 {
-		t.Fatalf("want 6 checks, got %d", len(results))
+	if len(results) != 8 {
+		t.Fatalf("want 8 checks, got %d", len(results))
 	}
 	if w := Worst(results); w != StatusOK {
 		for _, r := range results {
@@ -339,3 +342,155 @@ func TestRun_HealthyCluster(t *testing.T) {
 		t.Fatalf("healthy cluster worst = %v, want ok", w)
 	}
 }
+
+func node(name, cpu, mem string, gpu ...string) *corev1.Node {
+	alloc := corev1.ResourceList{
+		corev1.ResourceCPU:    resource.MustParse(cpu),
+		corev1.ResourceMemory: resource.MustParse(mem),
+	}
+	if len(gpu) == 2 {
+		alloc[corev1.ResourceName(gpu[0])] = resource.MustParse(gpu[1])
+	}
+	return &corev1.Node{
+		ObjectMeta: metav1.ObjectMeta{Name: name},
+		Status: corev1.NodeStatus{
+			Allocatable: alloc,
+			Conditions:  []corev1.NodeCondition{{Type: corev1.NodeReady, Status: corev1.ConditionTrue}},
+		},
+	}
+}
+
+func TestParseCPUMem(t *testing.T) {
+	cpu, mem, ok := parseCPUMem("cpu=2,memory=8Gi")
+	if !ok || cpu.String() != "2" || mem.String() != "8Gi" {
+		t.Fatalf("parseCPUMem => %q %q %v", cpu.String(), mem.String(), ok)
+	}
+	if _, _, ok := parseCPUMem("cpu=2"); ok {
+		t.Fatalf("missing memory should be !ok")
+	}
+	if _, _, ok := parseCPUMem("cpu=abc,memory=8Gi"); ok {
+		t.Fatalf("unparseable cpu should be !ok")
+	}
+}
+
+func TestParseGPU(t *testing.T) {
+	name, qty, req := parseGPU("nvidia.com/gpu=1")
+	if !req || string(name) != "nvidia.com/gpu" || qty.String() != "1" {
+		t.Fatalf("parseGPU => %q %q %v", name, qty.String(), req)
+	}
+	if _, _, req := parseGPU("nvidia.com/gpu=0"); req {
+		t.Fatalf("zero gpu should be !requested")
+	}
+	if _, _, req := parseGPU(""); req {
+		t.Fatalf("empty should be !requested")
+	}
+}
+
+func TestCheckNodeFit(t *testing.T) {
+	full := map[string]string{"RESOURCE_REQUESTS": "cpu=2,memory=8Gi", "GPU_REQUESTS": "nvidia.com/gpu=1"}
+	cpuOnly := map[string]string{"RESOURCE_REQUESTS": "cpu=2,memory=8Gi"}
+
+	t.Run("fits cpu+mem+gpu", func(t *testing.T) {
+		cs := fake.NewClientset(node("n1", "4", "16Gi", "nvidia.com/gpu", "2"))
+		if r := checkNodeFit(bg(), cs, full); r.Status != StatusOK {
+			t.Fatalf("=> %v (%q), want ok", r.Status, r.Detail)
+		}
+	})
+	t.Run("no node big enough -> fail", func(t *testing.T) {
+		cs := fake.NewClientset(node("n1", "1", "2Gi"))
+		if r := checkNodeFit(bg(), cs, cpuOnly); r.Status != StatusFail {
+			t.Fatalf("=> %v (%q), want fail", r.Status, r.Detail)
+		}
+	})
+	t.Run("gpu requested but none -> warn", func(t *testing.T) {
+		cs := fake.NewClientset(node("n1", "4", "16Gi")) // cpu/mem fit, no gpu
+		if r := checkNodeFit(bg(), cs, full); r.Status != StatusWarn {
+			t.Fatalf("=> %v (%q), want warn", r.Status, r.Detail)
+		}
+	})
+	t.Run("cpu+mem and gpu on different nodes -> warn, not ok", func(t *testing.T) {
+		// The Bugbot #91 case: one node fits cpu/mem, a different node has the
+		// GPU but is too small. No single node runs a GPU job → must NOT be ok.
+		cs := fake.NewClientset(
+			node("big", "4", "16Gi"),                       // cpu/mem, no gpu
+			node("gpu", "1", "1Gi", "nvidia.com/gpu", "2"), // gpu, too small
+		)
+		if r := checkNodeFit(bg(), cs, full); r.Status != StatusWarn {
+			t.Fatalf("=> %v (%q), want warn (no single node fits all)", r.Status, r.Detail)
+		}
+	})
+	t.Run("single node fits cpu+mem+gpu -> ok", func(t *testing.T) {
+		cs := fake.NewClientset(
+			node("big", "4", "16Gi"),                         // distractor: cpu/mem only
+			node("full", "4", "16Gi", "nvidia.com/gpu", "1"), // satisfies everything
+		)
+		if r := checkNodeFit(bg(), cs, full); r.Status != StatusOK {
+			t.Fatalf("=> %v (%q), want ok", r.Status, r.Detail)
+		}
+	})
+	t.Run("not-ready node doesn't count -> fail", func(t *testing.T) {
+		n := node("n1", "8", "32Gi")
+		n.Status.Conditions = []corev1.NodeCondition{{Type: corev1.NodeReady, Status: corev1.ConditionFalse}}
+		cs := fake.NewClientset(n)
+		if r := checkNodeFit(bg(), cs, cpuOnly); r.Status != StatusFail {
+			t.Fatalf("=> %v (%q), want fail (node not ready)", r.Status, r.Detail)
+		}
+	})
+	t.Run("missing RESOURCE_REQUESTS -> warn", func(t *testing.T) {
+		cs := fake.NewClientset(node("n1", "4", "16Gi"))
+		if r := checkNodeFit(bg(), cs, map[string]string{}); r.Status != StatusWarn {
+			t.Fatalf("=> %v (%q), want warn", r.Status, r.Detail)
+		}
+	})
+}
+
+func dockerSecret(name string, data []byte) *corev1.Secret {
+	return &corev1.Secret{
+		ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: ns},
+		Type:       corev1.SecretTypeDockerConfigJson,
+		Data:       map[string][]byte{corev1.DockerConfigJsonKey: data},
+	}
+}
+
+func jmDepWithPullSecret(release, secretName string) *appsv1.Deployment {
+	d := jobsManagerDep(release)
+	if secretName != "" {
+		d.Spec.Template.Spec.ImagePullSecrets = []corev1.LocalObjectReference{{Name: secretName}}
+	}
+	return d
+}
+
+func TestCheckImagePull(t *testing.T) {
+	rel := &cluster.ParentRelease{ReleaseName: "tb"}
+
+	t.Run("no pull secret -> ok", func(t *testing.T) {
+		cs := fake.NewClientset(jmDepWithPullSecret("tb", ""))
+		if r := checkImagePull(bg(), cs, ns, rel); r.Status != StatusOK {
+			t.Fatalf("=> %v (%q), want ok", r.Status, r.Detail)
+		}
+	})
+	t.Run("valid dockerconfigjson -> ok", func(t *testing.T) {
+		cs := fake.NewClientset(
+			jmDepWithPullSecret("tb", "reg"),
+			dockerSecret("reg", []byte(`{"auths":{}}`)),
+		)
+		if r := checkImagePull(bg(), cs, ns, rel); r.Status != StatusOK {
+			t.Fatalf("=> %v (%q), want ok", r.Status, r.Detail)
+		}
+	})
+	t.Run("missing secret -> fail", func(t *testing.T) {
+		cs := fake.NewClientset(jmDepWithPullSecret("tb", "reg")) // secret absent
+		if r := checkImagePull(bg(), cs, ns, rel); r.Status != StatusFail {
+			t.Fatalf("=> %v (%q), want fail", r.Status, r.Detail)
+		}
+	})
+	t.Run("malformed dockerconfigjson -> fail", func(t *testing.T) {
+		cs := fake.NewClientset(
+			jmDepWithPullSecret("tb", "reg"),
+			dockerSecret("reg", []byte("not json")),
+		)
+		if r := checkImagePull(bg(), cs, ns, rel); r.Status != StatusFail {
+			t.Fatalf("=> %v (%q), want fail", r.Status, r.Detail)
+		}
+	})
+}

From e3226134f4bb279af316cbe9f095bef543596ade Mon Sep 17 00:00:00 2001
From: lukasWuttke <54042461+LukasWodka@users.noreply.github.com>
Date: Fri, 19 Jun 2026 09:10:50 +0200
Subject: [PATCH 08/15] =?UTF-8?q?feat(cli):=20auth=20scaffold=20=E2=80=94?=
 =?UTF-8?q?=20login/logout/auth=20status=20+=20config=20+=20backend=20clie?=
 =?UTF-8?q?nt=20(cli#83)=20(#85)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* feat(cli): auth scaffold — login/logout/auth status + config + backend client (cli#83)

RFC-0001 (backend#830) Phase-1 CLI side, scaffolded ahead of the backend
device-grant so it activates the moment backend#835 ships.

- internal/config: ~/.tracebloc config store (0600, atomic write) — backend
  env + user token + active client. Fully functional + unit-tested.
- internal/api: backend REST client. CLIENT_ENV -> {dev,stg,prod} base URL
  (matches the installer's _backend_url); proxy + CA aware (honors
  HTTP(S)_PROXY / NO_PROXY + the system cert pool, for corporate-proxy
  networks); the RFC 8628 device-flow methods (RequestDeviceCode + PollToken
  with the authorization_pending / slow_down / expired_token / access_denied
  states). Unit-tested via httptest.
- internal/cli: `tracebloc login` (device flow — show URL + code, poll, store
  the token), `logout`, `auth status`. `client create/list/use` are stubbed
  (cli#84 — they need the user token from login + provisioning backend#836).

login calls /device/code + /device/token, which land in backend#835; until
then it reports that the backend doesn't support browser sign-in yet. Builds,
gofmt-clean, unit-tested (config round-trip + 0600 mode; api URL map +
device-flow poll states); `tracebloc --help` lists the new verbs.

Part of cli#83 / backend#830 (end-to-end login activates with backend#835).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

* feat(cli): authenticate with Bearer + verify token on login (cli#83) (#86)

* feat(cli): authenticate with Bearer + verify token on login (cli#83)

Completes `tracebloc login` against the now-built device-grant endpoints
(backend#846). The token the flow issues is a ClientAccessToken, which the
backend authenticates as `Authorization: Bearer` (ClientAccessTokenAuthentication,
backend#835) — not the legacy DRF `Token` scheme the client was sending on
authenticated requests, which would have failed to authenticate a logged-in token.

- internal/api: authenticated requests now send `Bearer <token>` (was `Token`);
  add get() + WhoAmI() (GET /userinfo/) to confirm the token + fetch the account.
- login now verifies the freshly-issued token (best-effort) and stores/shows the
  account ("Signed in as you@co.com"); a failed lookup never fails a valid sign-in.
- tests: WhoAmI sends Bearer + parses the identity; a 401 surfaces as an APIError.

The device-flow contract (paths/fields/error codes) was already aligned with
backend#846 — verified, unchanged. Stacked on cli#85 (auth scaffold).
go build/vet/test green.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

* test(cli): cover the login command end-to-end + add test seams (cli#83)

internal/api was unit-tested, but the login / logout / auth status COMMANDS
weren't. Adds auth_test.go driving the full device-flow command against an
httptest backend whose shapes match backend#846 — so it also guards the
CLI<->backend contract that the Token->Bearer fix corrected:

- login: device_code -> authorization_pending -> token -> WhoAmI(Bearer) ->
  "Signed in as ..." with config persisted; the 404 "unsupported backend" gate
  (asserts no token is stored); access_denied.
- logout clears the token; auth status (signed-in + not-signed-in).

Two unexported test seams in auth.go — newAPIClient (point at an httptest
server) and pollAfter (fire the poll immediately) — since the flow otherwise
makes real HTTP calls on a timer. go build / vet / test green.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 internal/api/client.go         | 251 +++++++++++++++++++++++++++++++++
 internal/api/client_test.go    | 140 ++++++++++++++++++
 internal/cli/auth.go           | 205 +++++++++++++++++++++++++++
 internal/cli/auth_test.go      | 163 +++++++++++++++++++++
 internal/cli/client.go         |  75 ++++++++++
 internal/cli/root.go           |   5 +
 internal/config/config.go      | 122 ++++++++++++++++
 internal/config/config_test.go |  64 +++++++++
 8 files changed, 1025 insertions(+)
 create mode 100644 internal/api/client.go
 create mode 100644 internal/api/client_test.go
 create mode 100644 internal/cli/auth.go
 create mode 100644 internal/cli/auth_test.go
 create mode 100644 internal/cli/client.go
 create mode 100644 internal/config/config.go
 create mode 100644 internal/config/config_test.go

diff --git a/internal/api/client.go b/internal/api/client.go
new file mode 100644
index 0000000..0bb22d3
--- /dev/null
+++ b/internal/api/client.go
@@ -0,0 +1,251 @@
+// Package api is the tracebloc CLI's client for the central backend REST API
+// (browser login + client provisioning). It is distinct from internal/submit,
+// which talks to the in-cluster jobs-manager: this one reaches the public
+// backend over real TLS. RFC-0001 (backend#830); the device-flow endpoints
+// (/device/code, /device/token) land in backend#835, so RequestDeviceCode /
+// PollToken are written against the RFC 8628 spec and go live when that ships.
+package api
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"io"
+	"net/http"
+	"os"
+	"strings"
+	"time"
+)
+
+// Backend environments (mirror CLIENT_ENV).
+const (
+	EnvDev  = "dev"
+	EnvStg  = "stg"
+	EnvProd = "prod"
+)
+
+const defaultTimeout = 30 * time.Second
+
+// BaseURL maps a CLIENT_ENV value to the backend base URL — kept in lock-step
+// with the installer's `_backend_url` and client-runtime's CLIENT_ENV→backend
+// mapping. Unknown / empty → prod.
+func BaseURL(env string) string {
+	switch strings.ToLower(env) {
+	case EnvDev:
+		return "https://dev-api.tracebloc.io"
+	case EnvStg:
+		return "https://stg-api.tracebloc.io"
+	default:
+		return "https://api.tracebloc.io"
+	}
+}
+
+// ResolveEnv picks the backend env: an explicit value (a --env flag) wins,
+// then $CLIENT_ENV, then prod.
+func ResolveEnv(explicit string) string {
+	if explicit != "" {
+		return strings.ToLower(explicit)
+	}
+	if e := os.Getenv("CLIENT_ENV"); e != "" {
+		return strings.ToLower(e)
+	}
+	return EnvProd
+}
+
+// Client talks to the backend REST API. Token (the user token from login) is
+// optional: the device-flow endpoints are unauthenticated; provisioning calls
+// set it.
+type Client struct {
+	BaseURL string
+	Token   string
+	HTTP    *http.Client
+}
+
+// New returns a Client for the given env. The HTTP client is proxy- and
+// CA-aware — it honors HTTP(S)_PROXY/NO_PROXY and the system cert pool —
+// because RFC-0001 must work behind a corporate / TLS-inspecting proxy
+// (backend#830 Q1). Unlike internal/submit (in-cluster, no real TLS) this
+// verifies certificates: it's the public backend.
+func New(env string) *Client {
+	return &Client{
+		BaseURL: strings.TrimRight(BaseURL(env), "/"),
+		HTTP: &http.Client{
+			Timeout:   defaultTimeout,
+			Transport: &http.Transport{Proxy: http.ProxyFromEnvironment},
+		},
+	}
+}
+
+// APIError is a non-2xx response, with the remote body surfaced verbatim.
+type APIError struct {
+	StatusCode int
+	Body       string
+	URL        string
+}
+
+func (e *APIError) Error() string {
+	return fmt.Sprintf("%s returned HTTP %d: %s", e.URL, e.StatusCode, strings.TrimSpace(e.Body))
+}
+
+// post sends an optional JSON body and returns the status code + raw response.
+func (c *Client) post(ctx context.Context, path string, body any) (int, []byte, error) {
+	var rdr io.Reader
+	if body != nil {
+		b, err := json.Marshal(body)
+		if err != nil {
+			return 0, nil, fmt.Errorf("encoding request: %w", err)
+		}
+		rdr = bytes.NewReader(b)
+	}
+	url := c.BaseURL + path
+	req, err := http.NewRequestWithContext(ctx, http.MethodPost, url, rdr)
+	if err != nil {
+		return 0, nil, fmt.Errorf("building request: %w", err)
+	}
+	req.Header.Set("Content-Type", "application/json")
+	c.setAuth(req)
+	resp, err := c.HTTP.Do(req)
+	if err != nil {
+		return 0, nil, fmt.Errorf("POST %s: %w", url, err)
+	}
+	defer func() { _ = resp.Body.Close() }()
+	raw, err := io.ReadAll(resp.Body)
+	if err != nil {
+		return resp.StatusCode, nil, fmt.Errorf("reading response from %s: %w", url, err)
+	}
+	return resp.StatusCode, raw, nil
+}
+
+// setAuth attaches the stored token as a Bearer credential. The login token is
+// a ClientAccessToken, authenticated by the backend's
+// ClientAccessTokenAuthentication (keyword "Bearer", backend#835) — NOT the
+// legacy DRF "Token" scheme.
+func (c *Client) setAuth(req *http.Request) {
+	if c.Token != "" {
+		req.Header.Set("Authorization", "Bearer "+c.Token)
+	}
+}
+
+// get sends an authenticated GET and returns the status code + raw response.
+func (c *Client) get(ctx context.Context, path string) (int, []byte, error) {
+	url := c.BaseURL + path
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
+	if err != nil {
+		return 0, nil, fmt.Errorf("building request: %w", err)
+	}
+	c.setAuth(req)
+	resp, err := c.HTTP.Do(req)
+	if err != nil {
+		return 0, nil, fmt.Errorf("GET %s: %w", url, err)
+	}
+	defer func() { _ = resp.Body.Close() }()
+	raw, err := io.ReadAll(resp.Body)
+	if err != nil {
+		return resp.StatusCode, nil, fmt.Errorf("reading response from %s: %w", url, err)
+	}
+	return resp.StatusCode, raw, nil
+}
+
+// ── Device Authorization Grant (RFC 8628) — backend endpoints land in #835 ──
+
+// DeviceCodeResponse is the reply from POST /device/code.
+type DeviceCodeResponse struct {
+	DeviceCode              string `json:"device_code"`
+	UserCode                string `json:"user_code"`
+	VerificationURI         string `json:"verification_uri"`
+	VerificationURIComplete string `json:"verification_uri_complete"`
+	ExpiresIn               int    `json:"expires_in"`
+	Interval                int    `json:"interval"`
+}
+
+// RequestDeviceCode starts the device flow.
+func (c *Client) RequestDeviceCode(ctx context.Context) (*DeviceCodeResponse, error) {
+	url := c.BaseURL + "/device/code"
+	status, raw, err := c.post(ctx, "/device/code", nil)
+	if err != nil {
+		return nil, err
+	}
+	if status < 200 || status >= 300 {
+		return nil, &APIError{StatusCode: status, Body: string(raw), URL: url}
+	}
+	var out DeviceCodeResponse
+	if err := json.Unmarshal(raw, &out); err != nil {
+		return nil, fmt.Errorf("decoding device-code response: %w", err)
+	}
+	if out.DeviceCode == "" || out.UserCode == "" {
+		return nil, fmt.Errorf("device-code response missing device_code/user_code (got %q)", string(raw))
+	}
+	return &out, nil
+}
+
+// Device-flow poll outcomes (RFC 8628 §3.5): pending / slow_down mean "keep
+// polling"; expired_token / access_denied are terminal.
+var (
+	ErrAuthorizationPending = errors.New("authorization_pending")
+	ErrSlowDown             = errors.New("slow_down")
+	ErrExpiredToken         = errors.New("expired_token")
+	ErrAccessDenied         = errors.New("access_denied")
+)
+
+// PollToken polls POST /device/token once. It returns the user token on
+// approval, or one of the Err* sentinels (pending/slow_down → keep polling,
+// expired/denied → stop), or an *APIError for anything else.
+func (c *Client) PollToken(ctx context.Context, deviceCode string) (string, error) {
+	url := c.BaseURL + "/device/token"
+	status, raw, err := c.post(ctx, "/device/token", map[string]string{"device_code": deviceCode})
+	if err != nil {
+		return "", err
+	}
+	var body struct {
+		Token string `json:"token"`
+		Error string `json:"error"`
+	}
+	_ = json.Unmarshal(raw, &body) // best-effort; the status + error field drive the result
+	if status >= 200 && status < 300 {
+		if body.Token == "" {
+			return "", fmt.Errorf("device-token success response missing token (got %q)", string(raw))
+		}
+		return body.Token, nil
+	}
+	switch body.Error {
+	case "authorization_pending":
+		return "", ErrAuthorizationPending
+	case "slow_down":
+		return "", ErrSlowDown
+	case "expired_token":
+		return "", ErrExpiredToken
+	case "access_denied":
+		return "", ErrAccessDenied
+	}
+	return "", &APIError{StatusCode: status, Body: string(raw), URL: url}
+}
+
+// ── Authenticated calls (Bearer ClientAccessToken) ──
+
+// Identity is the signed-in user, from GET /userinfo/.
+type Identity struct {
+	Email   string `json:"email"`
+	Type    string `json:"type"`
+	Account string `json:"account"`
+}
+
+// WhoAmI fetches the signed-in user from the backend, authenticating with the
+// stored token (Bearer). It confirms the token is live and returns the account
+// — `login` uses it to verify the credential it just obtained. Requires Token.
+func (c *Client) WhoAmI(ctx context.Context) (*Identity, error) {
+	url := c.BaseURL + "/userinfo/"
+	status, raw, err := c.get(ctx, "/userinfo/")
+	if err != nil {
+		return nil, err
+	}
+	if status < 200 || status >= 300 {
+		return nil, &APIError{StatusCode: status, Body: string(raw), URL: url}
+	}
+	var id Identity
+	if err := json.Unmarshal(raw, &id); err != nil {
+		return nil, fmt.Errorf("decoding userinfo response: %w", err)
+	}
+	return &id, nil
+}
diff --git a/internal/api/client_test.go b/internal/api/client_test.go
new file mode 100644
index 0000000..41af464
--- /dev/null
+++ b/internal/api/client_test.go
@@ -0,0 +1,140 @@
+package api
+
+import (
+	"context"
+	"errors"
+	"net/http"
+	"net/http/httptest"
+	"testing"
+)
+
+func TestBaseURL(t *testing.T) {
+	cases := map[string]string{
+		"dev":   "https://dev-api.tracebloc.io",
+		"stg":   "https://stg-api.tracebloc.io",
+		"prod":  "https://api.tracebloc.io",
+		"":      "https://api.tracebloc.io",
+		"DEV":   "https://dev-api.tracebloc.io", // case-insensitive
+		"weird": "https://api.tracebloc.io",     // unknown -> prod
+	}
+	for env, want := range cases {
+		if got := BaseURL(env); got != want {
+			t.Errorf("BaseURL(%q) = %q, want %q", env, got, want)
+		}
+	}
+}
+
+func TestResolveEnv(t *testing.T) {
+	t.Setenv("CLIENT_ENV", "stg")
+	if got := ResolveEnv("dev"); got != "dev" {
+		t.Errorf("explicit should win: got %q", got)
+	}
+	if got := ResolveEnv(""); got != "stg" {
+		t.Errorf("CLIENT_ENV should be used: got %q", got)
+	}
+	t.Setenv("CLIENT_ENV", "")
+	if got := ResolveEnv(""); got != "prod" {
+		t.Errorf("default should be prod: got %q", got)
+	}
+}
+
+func TestRequestDeviceCode(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		if r.URL.Path != "/device/code" || r.Method != http.MethodPost {
+			t.Errorf("unexpected %s %s", r.Method, r.URL.Path)
+		}
+		_, _ = w.Write([]byte(`{"device_code":"dc","user_code":"WDJB-MJHT","verification_uri":"https://x/activate","expires_in":600,"interval":5}`))
+	}))
+	defer srv.Close()
+	c := New("prod")
+	c.BaseURL = srv.URL
+	resp, err := c.RequestDeviceCode(context.Background())
+	if err != nil {
+		t.Fatal(err)
+	}
+	if resp.UserCode != "WDJB-MJHT" || resp.Interval != 5 || resp.DeviceCode != "dc" {
+		t.Errorf("got %+v", resp)
+	}
+}
+
+func TestPollTokenSequence(t *testing.T) {
+	var n int
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+		n++
+		switch n {
+		case 1:
+			w.WriteHeader(http.StatusBadRequest)
+			_, _ = w.Write([]byte(`{"error":"authorization_pending"}`))
+		case 2:
+			w.WriteHeader(http.StatusBadRequest)
+			_, _ = w.Write([]byte(`{"error":"slow_down"}`))
+		default:
+			_, _ = w.Write([]byte(`{"token":"usertoken123"}`))
+		}
+	}))
+	defer srv.Close()
+	c := New("prod")
+	c.BaseURL = srv.URL
+	ctx := context.Background()
+	if _, err := c.PollToken(ctx, "dc"); !errors.Is(err, ErrAuthorizationPending) {
+		t.Errorf("poll 1: want pending, got %v", err)
+	}
+	if _, err := c.PollToken(ctx, "dc"); !errors.Is(err, ErrSlowDown) {
+		t.Errorf("poll 2: want slow_down, got %v", err)
+	}
+	tok, err := c.PollToken(ctx, "dc")
+	if err != nil || tok != "usertoken123" {
+		t.Errorf("poll 3: want token, got %q / %v", tok, err)
+	}
+}
+
+func TestPollTokenDenied(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+		w.WriteHeader(http.StatusBadRequest)
+		_, _ = w.Write([]byte(`{"error":"access_denied"}`))
+	}))
+	defer srv.Close()
+	c := New("prod")
+	c.BaseURL = srv.URL
+	if _, err := c.PollToken(context.Background(), "dc"); !errors.Is(err, ErrAccessDenied) {
+		t.Errorf("want access_denied, got %v", err)
+	}
+}
+
+func TestWhoAmI(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		if r.URL.Path != "/userinfo/" || r.Method != http.MethodGet {
+			t.Errorf("unexpected %s %s", r.Method, r.URL.Path)
+		}
+		if got := r.Header.Get("Authorization"); got != "Bearer usertoken123" {
+			t.Errorf("auth header = %q, want %q", got, "Bearer usertoken123")
+		}
+		_, _ = w.Write([]byte(`{"email":"ds@tracebloc.io","type":"DS","account":"Acme"}`))
+	}))
+	defer srv.Close()
+	c := New("prod")
+	c.BaseURL = srv.URL
+	c.Token = "usertoken123"
+	id, err := c.WhoAmI(context.Background())
+	if err != nil {
+		t.Fatal(err)
+	}
+	if id.Email != "ds@tracebloc.io" || id.Account != "Acme" {
+		t.Errorf("got %+v", id)
+	}
+}
+
+func TestWhoAmIUnauthorized(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+		w.WriteHeader(http.StatusUnauthorized)
+		_, _ = w.Write([]byte(`{"detail":"Invalid token."}`))
+	}))
+	defer srv.Close()
+	c := New("prod")
+	c.BaseURL = srv.URL
+	c.Token = "bad"
+	var ae *APIError
+	if _, err := c.WhoAmI(context.Background()); !errors.As(err, &ae) || ae.StatusCode != http.StatusUnauthorized {
+		t.Errorf("want APIError 401, got %v", err)
+	}
+}
diff --git a/internal/cli/auth.go b/internal/cli/auth.go
new file mode 100644
index 0000000..6f15965
--- /dev/null
+++ b/internal/cli/auth.go
@@ -0,0 +1,205 @@
+package cli
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"net/http"
+	"time"
+
+	"github.com/spf13/cobra"
+
+	"github.com/tracebloc/cli/internal/api"
+	"github.com/tracebloc/cli/internal/config"
+	"github.com/tracebloc/cli/internal/ui"
+)
+
+// newLoginCmd implements `tracebloc login` — browser sign-in via the OAuth 2.0
+// Device Authorization Grant (RFC 8628). Works on a headless box: the CLI shows
+// a URL + short code, the human approves in a browser on any device, and the
+// CLI polls until a user token is issued and stored in ~/.tracebloc (0600).
+// The backend endpoints land in backend#835; until then login reports that the
+// backend doesn't support browser sign-in yet.
+func newLoginCmd() *cobra.Command {
+	var envFlag string
+	cmd := &cobra.Command{
+		Use:   "login",
+		Short: "Sign in to tracebloc in your browser (device flow)",
+		Long: `Sign in to tracebloc. The CLI prints a URL + short code; open the URL
+on any device (your laptop or phone), sign in the way you already do
+(password, Google, or GitHub), and approve the code. The CLI stores a
+user token in ~/.tracebloc (mode 0600).
+
+Works on a headless / SSH box — the browser and the CLI need not share a
+machine. Honors HTTP(S)_PROXY / NO_PROXY for corporate-proxy networks.`,
+		Args: cobra.NoArgs,
+		RunE: func(cmd *cobra.Command, _ []string) error {
+			return runLogin(cmd.Context(), printerFor(cmd), envFlag)
+		},
+	}
+	cmd.Flags().StringVar(&envFlag, "env", "",
+		"backend environment: dev|stg|prod (default: $CLIENT_ENV, then prod)")
+	return cmd
+}
+
+// Test seams: the device flow makes real HTTP calls on a timer, so tests
+// override the client factory (point it at an httptest server) and the poll
+// clock (fire immediately) rather than hitting the network / wall clock.
+var (
+	newAPIClient = api.New
+	pollAfter    = time.After
+)
+
+func runLogin(ctx context.Context, p *ui.Printer, envFlag string) error {
+	cfg, err := config.Load()
+	if err != nil {
+		return &exitError{code: 1, err: err}
+	}
+	env := api.ResolveEnv(envFlag)
+	client := newAPIClient(env)
+
+	dc, err := client.RequestDeviceCode(ctx)
+	if err != nil {
+		var ae *api.APIError
+		if errors.As(err, &ae) && ae.StatusCode == http.StatusNotFound {
+			return &exitError{code: 1, err: fmt.Errorf(
+				"this backend (%s) doesn't support browser login yet — the device-grant "+
+					"endpoints land in backend#835: %w", env, err)}
+		}
+		return &exitError{code: 1, err: err}
+	}
+
+	p.Section("Sign in to tracebloc")
+	uri := dc.VerificationURIComplete
+	if uri == "" {
+		uri = dc.VerificationURI
+	}
+	p.Field("open", uri)
+	p.Field("code", dc.UserCode)
+	p.Newline()
+	p.Hintf("Waiting for you to approve in the browser… (Ctrl-C to cancel)")
+
+	interval := dc.Interval
+	if interval <= 0 {
+		interval = 5
+	}
+	var deadline time.Time
+	if dc.ExpiresIn > 0 {
+		deadline = time.Now().Add(time.Duration(dc.ExpiresIn) * time.Second)
+	}
+
+	for {
+		if !deadline.IsZero() && time.Now().After(deadline) {
+			return &exitError{code: 1, err: errors.New("login timed out — re-run `tracebloc login`")}
+		}
+		select {
+		case <-ctx.Done():
+			return ctx.Err()
+		case <-pollAfter(time.Duration(interval) * time.Second):
+		}
+
+		tok, err := client.PollToken(ctx, dc.DeviceCode)
+		switch {
+		case err == nil:
+			cfg.Env = env
+			cfg.Token = tok
+			// Confirm the freshly-issued token actually authenticates, and
+			// capture the account to show + store. Best-effort: don't fail a
+			// successful sign-in just because this lookup couldn't run.
+			client.Token = tok
+			if id, werr := client.WhoAmI(ctx); werr == nil {
+				cfg.Email = id.Email
+			}
+			if err := cfg.Save(); err != nil {
+				return &exitError{code: 1, err: err}
+			}
+			p.Newline()
+			if cfg.Email != "" {
+				p.Successf("Signed in as %s. Token saved to ~/.tracebloc (0600).", cfg.Email)
+			} else {
+				p.Successf("Signed in. Token saved to ~/.tracebloc (0600).")
+			}
+			return nil
+		case errors.Is(err, api.ErrAuthorizationPending):
+			// not approved yet — keep polling
+		case errors.Is(err, api.ErrSlowDown):
+			interval++
+		case errors.Is(err, api.ErrExpiredToken):
+			return &exitError{code: 1, err: errors.New("the sign-in code expired — re-run `tracebloc login`")}
+		case errors.Is(err, api.ErrAccessDenied):
+			return &exitError{code: 1, err: errors.New("sign-in was denied in the browser")}
+		default:
+			return &exitError{code: 1, err: err}
+		}
+	}
+}
+
+// newLogoutCmd implements `tracebloc logout` — clears the stored token.
+func newLogoutCmd() *cobra.Command {
+	return &cobra.Command{
+		Use:   "logout",
+		Short: "Sign out (clear the stored token)",
+		Args:  cobra.NoArgs,
+		RunE: func(cmd *cobra.Command, _ []string) error {
+			cfg, err := config.Load()
+			if err != nil {
+				return &exitError{code: 1, err: err}
+			}
+			if !cfg.SignedIn() {
+				printerFor(cmd).Hintf("Already signed out.")
+				return nil
+			}
+			cfg.Token = ""
+			cfg.Email = ""
+			if err := cfg.Save(); err != nil {
+				return &exitError{code: 1, err: err}
+			}
+			printerFor(cmd).Successf("Signed out.")
+			return nil
+		},
+	}
+}
+
+// newAuthCmd is the `tracebloc auth` parent; today it carries `auth status`.
+func newAuthCmd() *cobra.Command {
+	cmd := &cobra.Command{
+		Use:   "auth",
+		Short: "Inspect tracebloc authentication state",
+	}
+	cmd.AddCommand(newAuthStatusCmd())
+	return cmd
+}
+
+// newAuthStatusCmd implements `tracebloc auth status`.
+func newAuthStatusCmd() *cobra.Command {
+	return &cobra.Command{
+		Use:   "status",
+		Short: "Show whether you're signed in, and to which backend",
+		Args:  cobra.NoArgs,
+		RunE: func(cmd *cobra.Command, _ []string) error {
+			cfg, err := config.Load()
+			if err != nil {
+				return &exitError{code: 1, err: err}
+			}
+			p := printerFor(cmd)
+			if !cfg.SignedIn() {
+				p.Hintf("Not signed in. Run `tracebloc login`.")
+				return nil
+			}
+			env := cfg.Env
+			if env == "" {
+				env = api.EnvProd
+			}
+			p.Section("tracebloc auth")
+			p.Field("status", "signed in")
+			p.Field("backend", env)
+			if cfg.Email != "" {
+				p.Field("account", cfg.Email)
+			}
+			if cfg.ActiveClientID != "" {
+				p.Field("active client", cfg.ActiveClientID)
+			}
+			return nil
+		},
+	}
+}
diff --git a/internal/cli/auth_test.go b/internal/cli/auth_test.go
new file mode 100644
index 0000000..5be742b
--- /dev/null
+++ b/internal/cli/auth_test.go
@@ -0,0 +1,163 @@
+package cli
+
+import (
+	"bytes"
+	"net/http"
+	"net/http/httptest"
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/tracebloc/cli/internal/api"
+	"github.com/tracebloc/cli/internal/config"
+)
+
+// withTestBackend points the login command at an httptest server (via the
+// newAPIClient seam), makes polling instant (pollAfter seam), and isolates the
+// on-disk config to a temp dir. All are restored on cleanup.
+func withTestBackend(t *testing.T, h http.HandlerFunc) {
+	t.Helper()
+	srv := httptest.NewServer(h)
+	t.Cleanup(srv.Close)
+	t.Setenv("TRACEBLOC_CONFIG_DIR", t.TempDir())
+
+	origClient, origAfter := newAPIClient, pollAfter
+	newAPIClient = func(string) *api.Client {
+		return &api.Client{BaseURL: srv.URL, HTTP: srv.Client()}
+	}
+	pollAfter = func(time.Duration) <-chan time.Time {
+		ch := make(chan time.Time, 1)
+		ch <- time.Time{}
+		return ch
+	}
+	t.Cleanup(func() { newAPIClient = origClient; pollAfter = origAfter })
+}
+
+func runCmd(t *testing.T, args ...string) (string, error) {
+	t.Helper()
+	root := NewRootCmd(BuildInfo{Version: "test"})
+	var out bytes.Buffer
+	root.SetOut(&out)
+	root.SetErr(&out)
+	root.SetArgs(args)
+	err := root.Execute()
+	return out.String(), err
+}
+
+func TestLogin_FullFlow(t *testing.T) {
+	var polls int
+	withTestBackend(t, func(w http.ResponseWriter, r *http.Request) {
+		switch r.URL.Path {
+		case "/device/code":
+			_, _ = w.Write([]byte(`{"device_code":"dc","user_code":"WDJB-MJHT","verification_uri":"https://x/activate","expires_in":600,"interval":5}`))
+		case "/device/token":
+			polls++
+			if polls == 1 {
+				w.WriteHeader(http.StatusBadRequest)
+				_, _ = w.Write([]byte(`{"error":"authorization_pending"}`))
+				return
+			}
+			_, _ = w.Write([]byte(`{"token":"cat_abc"}`))
+		case "/userinfo/":
+			if got := r.Header.Get("Authorization"); got != "Bearer cat_abc" {
+				t.Errorf("userinfo auth header = %q, want %q", got, "Bearer cat_abc")
+			}
+			_, _ = w.Write([]byte(`{"email":"ds@tracebloc.io","account":"Acme"}`))
+		default:
+			t.Errorf("unexpected request path %s", r.URL.Path)
+		}
+	})
+
+	out, err := runCmd(t, "login")
+	if err != nil {
+		t.Fatalf("login: %v", err)
+	}
+	if polls != 2 {
+		t.Errorf("expected 2 polls (pending then token), got %d", polls)
+	}
+	cfg, _ := config.Load()
+	if cfg.Token != "cat_abc" {
+		t.Errorf("stored token = %q, want cat_abc", cfg.Token)
+	}
+	if cfg.Email != "ds@tracebloc.io" {
+		t.Errorf("stored email = %q, want ds@tracebloc.io", cfg.Email)
+	}
+	if !strings.Contains(out, "ds@tracebloc.io") {
+		t.Errorf("expected output to show the account, got:\n%s", out)
+	}
+}
+
+func TestLogin_BackendUnsupported(t *testing.T) {
+	withTestBackend(t, func(w http.ResponseWriter, _ *http.Request) {
+		w.WriteHeader(http.StatusNotFound)
+	})
+	_, err := runCmd(t, "login")
+	if err == nil || !strings.Contains(err.Error(), "doesn't support browser login") {
+		t.Errorf("want unsupported-backend error, got %v", err)
+	}
+	cfg, _ := config.Load()
+	if cfg.SignedIn() {
+		t.Error("must not store a token when the backend has no device endpoints")
+	}
+}
+
+func TestLogin_Denied(t *testing.T) {
+	withTestBackend(t, func(w http.ResponseWriter, r *http.Request) {
+		switch r.URL.Path {
+		case "/device/code":
+			_, _ = w.Write([]byte(`{"device_code":"dc","user_code":"X","interval":5}`))
+		default:
+			w.WriteHeader(http.StatusBadRequest)
+			_, _ = w.Write([]byte(`{"error":"access_denied"}`))
+		}
+	})
+	_, err := runCmd(t, "login")
+	if err == nil || !strings.Contains(err.Error(), "denied") {
+		t.Errorf("want access-denied error, got %v", err)
+	}
+}
+
+func TestLogout(t *testing.T) {
+	t.Setenv("TRACEBLOC_CONFIG_DIR", t.TempDir())
+	if err := (&config.Config{Token: "x", Email: "e@co"}).Save(); err != nil {
+		t.Fatal(err)
+	}
+	out, err := runCmd(t, "logout")
+	if err != nil {
+		t.Fatal(err)
+	}
+	cfg, _ := config.Load()
+	if cfg.SignedIn() {
+		t.Error("expected to be signed out")
+	}
+	if !strings.Contains(out, "Signed out") {
+		t.Errorf("got:\n%s", out)
+	}
+}
+
+func TestAuthStatus_SignedIn(t *testing.T) {
+	t.Setenv("TRACEBLOC_CONFIG_DIR", t.TempDir())
+	if err := (&config.Config{Env: "dev", Token: "x", Email: "ds@co"}).Save(); err != nil {
+		t.Fatal(err)
+	}
+	out, err := runCmd(t, "auth", "status")
+	if err != nil {
+		t.Fatal(err)
+	}
+	for _, want := range []string{"signed in", "ds@co", "dev"} {
+		if !strings.Contains(out, want) {
+			t.Errorf("status output missing %q, got:\n%s", want, out)
+		}
+	}
+}
+
+func TestAuthStatus_NotSignedIn(t *testing.T) {
+	t.Setenv("TRACEBLOC_CONFIG_DIR", t.TempDir())
+	out, err := runCmd(t, "auth", "status")
+	if err != nil {
+		t.Fatal(err)
+	}
+	if !strings.Contains(out, "Not signed in") {
+		t.Errorf("got:\n%s", out)
+	}
+}
diff --git a/internal/cli/client.go b/internal/cli/client.go
new file mode 100644
index 0000000..ddda04f
--- /dev/null
+++ b/internal/cli/client.go
@@ -0,0 +1,75 @@
+package cli
+
+import (
+	"errors"
+
+	"github.com/spf13/cobra"
+)
+
+// newClientCmd wires the `tracebloc client` subtree — provisioning and
+// selecting the client (machine) this host enrolls as. The verbs are stubbed
+// here: the implementation is cli#84 and depends on the backend device-grant
+// (backend#835, for the user token from `tracebloc login`) and provisioning
+// (backend#836). The command shape is in place now so the tree + help are
+// stable and `--name`/`--location` are pinned.
+func newClientCmd() *cobra.Command {
+	cmd := &cobra.Command{
+		Use:   "client",
+		Short: "Provision and manage tracebloc clients (machines)",
+		Long: `Provision a tracebloc client for this machine and list/select clients
+in your account.
+
+Requires sign-in first (` + "`tracebloc login`" + `). Implemented in cli#84;
+the backend it calls lands in backend#835 / #836.`,
+	}
+	cmd.AddCommand(newClientCreateCmd(), newClientListCmd(), newClientUseCmd())
+	return cmd
+}
+
+// errClientNotYet is the shared "this lands in cli#84" stub error.
+func errClientNotYet() error {
+	return &exitError{code: 1, err: errors.New(
+		"`tracebloc client` is not implemented yet — it lands in cli#84 and needs the " +
+			"backend device-grant (backend#835) + provisioning (backend#836). " +
+			"`tracebloc login` is the first piece.")}
+}
+
+func newClientCreateCmd() *cobra.Command {
+	var name, location string
+	cmd := &cobra.Command{
+		Use:   "create",
+		Short: "Provision a new client for this machine (--name, --location)",
+		Args:  cobra.NoArgs,
+		RunE: func(_ *cobra.Command, _ []string) error {
+			return errClientNotYet()
+		},
+	}
+	cmd.Flags().StringVar(&name, "name", "",
+		"human-readable client name (shown on your dashboard + carbon reports)")
+	cmd.Flags().StringVar(&location, "location", "",
+		"physical location zone for carbon footprint (e.g. DE); auto-detected + confirmed if omitted")
+	return cmd
+}
+
+func newClientListCmd() *cobra.Command {
+	return &cobra.Command{
+		Use:     "list",
+		Aliases: []string{"ls"},
+		Short:   "List the clients in your account",
+		Args:    cobra.NoArgs,
+		RunE: func(_ *cobra.Command, _ []string) error {
+			return errClientNotYet()
+		},
+	}
+}
+
+func newClientUseCmd() *cobra.Command {
+	return &cobra.Command{
+		Use:   "use <client-id>",
+		Short: "Enroll this machine as an existing client",
+		Args:  cobra.ExactArgs(1),
+		RunE: func(_ *cobra.Command, _ []string) error {
+			return errClientNotYet()
+		},
+	}
+}
diff --git a/internal/cli/root.go b/internal/cli/root.go
index f1047d6..0bf6f8c 100644
--- a/internal/cli/root.go
+++ b/internal/cli/root.go
@@ -81,6 +81,11 @@ what's planned next.`,
 	root.AddCommand(newIngestCmd())
 	root.AddCommand(newClusterCmd())
 	root.AddCommand(newDatasetCmd())
+	// RFC-0001 (backend#830): browser sign-in + client provisioning.
+	root.AddCommand(newLoginCmd())
+	root.AddCommand(newLogoutCmd())
+	root.AddCommand(newAuthCmd())
+	root.AddCommand(newClientCmd())
 
 	// Bare `tracebloc` (no subcommand) renders a friendly home screen
 	// instead of cobra's raw usage dump. Subcommands and --help are
diff --git a/internal/config/config.go b/internal/config/config.go
new file mode 100644
index 0000000..5145fad
--- /dev/null
+++ b/internal/config/config.go
@@ -0,0 +1,122 @@
+// Package config persists the tracebloc CLI's user state — the backend
+// environment, the user token from `tracebloc login`, and the active client
+// for this machine — at ~/.tracebloc/config.json (mode 0600, it holds a
+// token). RFC-0001 (backend#830).
+package config
+
+import (
+	"encoding/json"
+	"errors"
+	"fmt"
+	"io/fs"
+	"os"
+	"path/filepath"
+)
+
+// Config is the on-disk CLI state. Every field is omitempty so a
+// partially-configured file stays small and forward-compatible.
+type Config struct {
+	Env            string `json:"env,omitempty"`              // dev|stg|prod (mirrors CLIENT_ENV)
+	Email          string `json:"email,omitempty"`            // who is signed in (display only)
+	Token          string `json:"token,omitempty"`            // user token from device login
+	ActiveClientID string `json:"active_client_id,omitempty"` // client this machine enrolls as
+}
+
+// Dir is the config directory: $TRACEBLOC_CONFIG_DIR if set (tests / ops
+// override), else ~/.tracebloc.
+func Dir() (string, error) {
+	if d := os.Getenv("TRACEBLOC_CONFIG_DIR"); d != "" {
+		return d, nil
+	}
+	home, err := os.UserHomeDir()
+	if err != nil {
+		return "", fmt.Errorf("locating home directory: %w", err)
+	}
+	return filepath.Join(home, ".tracebloc"), nil
+}
+
+// Path is the config file path (Dir()/config.json).
+func Path() (string, error) {
+	dir, err := Dir()
+	if err != nil {
+		return "", err
+	}
+	return filepath.Join(dir, "config.json"), nil
+}
+
+// Load reads the config. A missing file is NOT an error — it returns an empty
+// Config (a fresh machine that has never run `login`).
+func Load() (*Config, error) {
+	path, err := Path()
+	if err != nil {
+		return nil, err
+	}
+	data, err := os.ReadFile(path)
+	if errors.Is(err, fs.ErrNotExist) {
+		return &Config{}, nil
+	}
+	if err != nil {
+		return nil, fmt.Errorf("reading %s: %w", path, err)
+	}
+	var c Config
+	if err := json.Unmarshal(data, &c); err != nil {
+		return nil, fmt.Errorf("parsing %s: %w", path, err)
+	}
+	return &c, nil
+}
+
+// Save writes the config 0600 (creating the dir 0700), atomically: a temp file
+// in the same dir then a rename, so a crash mid-write can't truncate the token.
+func (c *Config) Save() error {
+	dir, err := Dir()
+	if err != nil {
+		return err
+	}
+	if err := os.MkdirAll(dir, 0o700); err != nil {
+		return fmt.Errorf("creating %s: %w", dir, err)
+	}
+	data, err := json.MarshalIndent(c, "", "  ")
+	if err != nil {
+		return fmt.Errorf("encoding config: %w", err)
+	}
+	data = append(data, '\n')
+
+	tmp, err := os.CreateTemp(dir, ".config-*.json")
+	if err != nil {
+		return fmt.Errorf("creating temp config: %w", err)
+	}
+	tmpName := tmp.Name()
+	defer func() { _ = os.Remove(tmpName) }() // no-op after a successful rename
+	if err := tmp.Chmod(0o600); err != nil {
+		_ = tmp.Close()
+		return fmt.Errorf("chmod temp config: %w", err)
+	}
+	if _, err := tmp.Write(data); err != nil {
+		_ = tmp.Close()
+		return fmt.Errorf("writing temp config: %w", err)
+	}
+	if err := tmp.Close(); err != nil {
+		return fmt.Errorf("closing temp config: %w", err)
+	}
+	path, _ := Path()
+	if err := os.Rename(tmpName, path); err != nil {
+		return fmt.Errorf("replacing %s: %w", path, err)
+	}
+	return nil
+}
+
+// Clear removes the config file (full sign-out + reset). A missing file is not
+// an error.
+func Clear() error {
+	path, err := Path()
+	if err != nil {
+		return err
+	}
+	if err := os.Remove(path); err != nil && !errors.Is(err, fs.ErrNotExist) {
+		return fmt.Errorf("removing %s: %w", path, err)
+	}
+	return nil
+}
+
+// SignedIn reports whether a user token is stored.
+func (c *Config) SignedIn() bool { return c.Token != "" }
diff --git a/internal/config/config_test.go b/internal/config/config_test.go
new file mode 100644
index 0000000..53dd368
--- /dev/null
+++ b/internal/config/config_test.go
@@ -0,0 +1,64 @@
+package config
+
+import (
+	"os"
+	"testing"
+)
+
+func TestSaveLoadRoundTrip(t *testing.T) {
+	t.Setenv("TRACEBLOC_CONFIG_DIR", t.TempDir())
+	in := &Config{Env: "dev", Email: "a@b.com", Token: "tok123", ActiveClientID: "edge_1"}
+	if err := in.Save(); err != nil {
+		t.Fatal(err)
+	}
+	out, err := Load()
+	if err != nil {
+		t.Fatal(err)
+	}
+	if *out != *in {
+		t.Fatalf("round-trip mismatch: %+v != %+v", out, in)
+	}
+	if !out.SignedIn() {
+		t.Error("SignedIn should be true when a token is present")
+	}
+}
+
+func TestSaveIs0600(t *testing.T) {
+	t.Setenv("TRACEBLOC_CONFIG_DIR", t.TempDir())
+	if err := (&Config{Token: "secret"}).Save(); err != nil {
+		t.Fatal(err)
+	}
+	p, _ := Path()
+	fi, err := os.Stat(p)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if fi.Mode().Perm() != 0o600 {
+		t.Errorf("config mode = %v, want 0600 (it holds a token)", fi.Mode().Perm())
+	}
+}
+
+func TestLoadMissingIsEmpty(t *testing.T) {
+	t.Setenv("TRACEBLOC_CONFIG_DIR", t.TempDir())
+	c, err := Load()
+	if err != nil {
+		t.Fatalf("missing config should not error: %v", err)
+	}
+	if c.SignedIn() {
+		t.Errorf("missing config should be empty, got %+v", c)
+	}
+}
+
+func TestClear(t *testing.T) {
+	t.Setenv("TRACEBLOC_CONFIG_DIR", t.TempDir())
+	_ = (&Config{Token: "x"}).Save()
+	if err := Clear(); err != nil {
+		t.Fatal(err)
+	}
+	if c, _ := Load(); c.SignedIn() {
+		t.Error("after Clear, should not be signed in")
+	}
+	if err := Clear(); err != nil {
+		t.Errorf("Clear on a missing file should be nil, got %v", err)
+	}
+}

From 7a38c82eadaffa800d229f54bc7b5d7ec0e60556 Mon Sep 17 00:00:00 2001
From: lukasWuttke <54042461+LukasWodka@users.noreply.github.com>
Date: Mon, 22 Jun 2026 11:04:25 +0200
Subject: [PATCH 09/15] feat(cli): client create / list / use commands (#84)
 (#92)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* feat(cli): client create / list / use commands (#84)

RFC-0001 P1. Adds the `client` subtree the login flow hands off to:
provision a tracebloc client for this machine, list the account's clients,
and attach this machine to an existing one.

- internal/slug: Go port of RFC-0001 Appendix B (backend common/utils/slug.py)
  — DNS-1123 slugify (NFKD via x/text) + collision suffix + empty-slug guard,
  kept in lock-step with the backend that validates the result.
- internal/api: CreateClient / ListClients / ListClientAdmins against
  /edge-device/, Bearer-authed (backend#836).
- internal/cli/client.go: create (--name / --location / --yes), list (ls),
  use <id>. The derived namespace is shown for confirmation; location is
  required (never silent-empty); a 403 surfaces the ask-an-admin path
  (backend#836); the generated machine credential is printed once.

Location auto-detect (cloud-metadata / GeoIP suggested default) is a
fast-follow — this PR takes --location or prompts for it.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

* refactor(cli): paginate client list + gather-then-review + parity/interactive tests

Self-review follow-up on the client commands:

- api: ListClients now follows DRF `next` to the end (was page-1 only), so
  `list`, `use <id>`, and create-time collision detection see every client in
  the account, not just the first page.
- cli: `create` gathers name + location first, then shows one review + a single
  confirm (was confirm-mid-flow) — matches the dataset-push interactive flow.
- tests: committed slug golden-parity test (24 pairs verified byte-identical
  against the Python slugify_dns1123, incl. NFKD ligatures/fractions/roman/
  fullwidth); interactive create + cancel via the prompter seam; paginated
  list; collision-suffix end-to-end.
- slug: doc-note the redundant dash-collapse (mirrors slug.py) and the
  ""/None fallback divergence.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 internal/api/client.go            | 125 ++++++++++++++
 internal/cli/client.go            | 267 +++++++++++++++++++++++++++---
 internal/cli/client_test.go       | 233 ++++++++++++++++++++++++++
 internal/slug/slug.go             |  94 +++++++++++
 internal/slug/slug_golden_test.go |  44 +++++
 internal/slug/slug_test.go        |  45 +++++
 6 files changed, 783 insertions(+), 25 deletions(-)
 create mode 100644 internal/cli/client_test.go
 create mode 100644 internal/slug/slug.go
 create mode 100644 internal/slug/slug_golden_test.go
 create mode 100644 internal/slug/slug_test.go

diff --git a/internal/api/client.go b/internal/api/client.go
index 0bb22d3..a636620 100644
--- a/internal/api/client.go
+++ b/internal/api/client.go
@@ -14,6 +14,7 @@ import (
 	"fmt"
 	"io"
 	"net/http"
+	"net/url"
 	"os"
 	"strings"
 	"time"
@@ -249,3 +250,127 @@ func (c *Client) WhoAmI(ctx context.Context) (*Identity, error) {
 	}
 	return &id, nil
 }
+
+// ── Client provisioning (Bearer-authed) — backend#836, /edge-device/ ──
+
+// ProvisionedClient is a tracebloc client (machine), as returned by the
+// EdgeDevice endpoints.
+type ProvisionedClient struct {
+	ID        int    `json:"id"`
+	Name      string `json:"first_name"`
+	Username  string `json:"username"`
+	Namespace string `json:"namespace"`
+	Location  string `json:"location"`
+	Status    int    `json:"status"`
+}
+
+// CreateClientRequest is the POST /edge-device/ body. The account is stamped
+// server-side from the token; password is the machine credential the caller
+// generates (write-only on the backend).
+type CreateClientRequest struct {
+	Name      string `json:"first_name"`
+	Namespace string `json:"namespace"`
+	Location  string `json:"location"`
+	Password  string `json:"password"`
+}
+
+// AdminContact is one "ask an admin" entry from GET /edge-device/admins/.
+type AdminContact struct {
+	Name  string `json:"name"`
+	Email string `json:"email"`
+}
+
+// CreateClient provisions a client. A 403 *APIError means the caller lacks
+// CLIENT_WRITE — callers fall back to ListClientAdmins for the ask-an-admin
+// path (backend#836 Q4).
+func (c *Client) CreateClient(ctx context.Context, req CreateClientRequest) (*ProvisionedClient, error) {
+	url := c.BaseURL + "/edge-device/"
+	status, raw, err := c.post(ctx, "/edge-device/", req)
+	if err != nil {
+		return nil, err
+	}
+	if status < 200 || status >= 300 {
+		return nil, &APIError{StatusCode: status, Body: string(raw), URL: url}
+	}
+	var out ProvisionedClient
+	if err := json.Unmarshal(raw, &out); err != nil {
+		return nil, fmt.Errorf("decoding create-client response: %w", err)
+	}
+	return &out, nil
+}
+
+// maxListPages bounds how many pages ListClients will follow — a backstop
+// against a misbehaving `next` chain, set well above any real account.
+const maxListPages = 100
+
+// ListClients returns ALL clients in the caller's account (GET /edge-device/).
+// The endpoint is DRF-paginated, so this follows `next` to the end — list,
+// `use <id>`, and create-time collision detection must see every client, not
+// just the first page. Also tolerates a bare (unpaginated) list body.
+func (c *Client) ListClients(ctx context.Context) ([]ProvisionedClient, error) {
+	var all []ProvisionedClient
+	path := "/edge-device/"
+	for pageNum := 0; path != ""; pageNum++ {
+		if pageNum >= maxListPages {
+			return nil, fmt.Errorf("client list exceeded %d pages — aborting", maxListPages)
+		}
+		reqURL := c.BaseURL + path
+		status, raw, err := c.get(ctx, path)
+		if err != nil {
+			return nil, err
+		}
+		if status < 200 || status >= 300 {
+			return nil, &APIError{StatusCode: status, Body: string(raw), URL: reqURL}
+		}
+		// Unpaginated deployment → a bare array; return it as-is.
+		var bare []ProvisionedClient
+		if err := json.Unmarshal(raw, &bare); err == nil {
+			return append(all, bare...), nil
+		}
+		var body struct {
+			Next    string              `json:"next"`
+			Results []ProvisionedClient `json:"results"`
+		}
+		if err := json.Unmarshal(raw, &body); err != nil {
+			return nil, fmt.Errorf("decoding client list: %w", err)
+		}
+		all = append(all, body.Results...)
+		path = nextPath(body.Next)
+	}
+	return all, nil
+}
+
+// nextPath reduces a DRF `next` link (an absolute URL) to the path+query this
+// client appends to BaseURL. Returns "" for an empty/unparseable link, which
+// ends the pagination loop.
+func nextPath(next string) string {
+	if next == "" {
+		return ""
+	}
+	u, err := url.Parse(next)
+	if err != nil {
+		return ""
+	}
+	if u.RawQuery != "" {
+		return u.Path + "?" + u.RawQuery
+	}
+	return u.Path
+}
+
+// ListClientAdmins returns who in the account can provision (the ask-an-admin
+// path), from GET /edge-device/admins/ (backend#836 Q4).
+func (c *Client) ListClientAdmins(ctx context.Context) ([]AdminContact, error) {
+	url := c.BaseURL + "/edge-device/admins/"
+	status, raw, err := c.get(ctx, "/edge-device/admins/")
+	if err != nil {
+		return nil, err
+	}
+	if status < 200 || status >= 300 {
+		return nil, &APIError{StatusCode: status, Body: string(raw), URL: url}
+	}
+	var out []AdminContact
+	if err := json.Unmarshal(raw, &out); err != nil {
+		return nil, fmt.Errorf("decoding admins response: %w", err)
+	}
+	return out, nil
+}
diff --git a/internal/cli/client.go b/internal/cli/client.go
index ddda04f..be66b41 100644
--- a/internal/cli/client.go
+++ b/internal/cli/client.go
@@ -1,53 +1,53 @@
 package cli
 
 import (
+	"context"
+	"crypto/rand"
+	"encoding/hex"
 	"errors"
+	"fmt"
+	"net/http"
+	"strconv"
+	"strings"
 
 	"github.com/spf13/cobra"
+
+	"github.com/tracebloc/cli/internal/api"
+	"github.com/tracebloc/cli/internal/config"
+	"github.com/tracebloc/cli/internal/slug"
+	"github.com/tracebloc/cli/internal/ui"
 )
 
-// newClientCmd wires the `tracebloc client` subtree — provisioning and
-// selecting the client (machine) this host enrolls as. The verbs are stubbed
-// here: the implementation is cli#84 and depends on the backend device-grant
-// (backend#835, for the user token from `tracebloc login`) and provisioning
-// (backend#836). The command shape is in place now so the tree + help are
-// stable and `--name`/`--location` are pinned.
+// newClientCmd wires the `tracebloc client` subtree — provisioning + selecting
+// the client (machine) this host enrolls as. Consumes the backend provisioning
+// endpoints (backend#836) with the user token from `tracebloc login`.
 func newClientCmd() *cobra.Command {
 	cmd := &cobra.Command{
 		Use:   "client",
 		Short: "Provision and manage tracebloc clients (machines)",
 		Long: `Provision a tracebloc client for this machine and list/select clients
-in your account.
-
-Requires sign-in first (` + "`tracebloc login`" + `). Implemented in cli#84;
-the backend it calls lands in backend#835 / #836.`,
+in your account.  Requires sign-in first (` + "`tracebloc login`" + `).`,
 	}
 	cmd.AddCommand(newClientCreateCmd(), newClientListCmd(), newClientUseCmd())
 	return cmd
 }
 
-// errClientNotYet is the shared "this lands in cli#84" stub error.
-func errClientNotYet() error {
-	return &exitError{code: 1, err: errors.New(
-		"`tracebloc client` is not implemented yet — it lands in cli#84 and needs the " +
-			"backend device-grant (backend#835) + provisioning (backend#836). " +
-			"`tracebloc login` is the first piece.")}
-}
-
 func newClientCreateCmd() *cobra.Command {
 	var name, location string
+	var yes bool
 	cmd := &cobra.Command{
 		Use:   "create",
 		Short: "Provision a new client for this machine (--name, --location)",
 		Args:  cobra.NoArgs,
-		RunE: func(_ *cobra.Command, _ []string) error {
-			return errClientNotYet()
+		RunE: func(cmd *cobra.Command, _ []string) error {
+			return runClientCreate(cmd.Context(), printerFor(cmd), clientPrompter(), name, location, yes)
 		},
 	}
 	cmd.Flags().StringVar(&name, "name", "",
 		"human-readable client name (shown on your dashboard + carbon reports)")
 	cmd.Flags().StringVar(&location, "location", "",
-		"physical location zone for carbon footprint (e.g. DE); auto-detected + confirmed if omitted")
+		"location zone for carbon footprint (e.g. DE); prompted if omitted")
+	cmd.Flags().BoolVar(&yes, "yes", false, "skip the confirmation prompt")
 	return cmd
 }
 
@@ -57,8 +57,8 @@ func newClientListCmd() *cobra.Command {
 		Aliases: []string{"ls"},
 		Short:   "List the clients in your account",
 		Args:    cobra.NoArgs,
-		RunE: func(_ *cobra.Command, _ []string) error {
-			return errClientNotYet()
+		RunE: func(cmd *cobra.Command, _ []string) error {
+			return runClientList(cmd.Context(), printerFor(cmd))
 		},
 	}
 }
@@ -68,8 +68,225 @@ func newClientUseCmd() *cobra.Command {
 		Use:   "use <client-id>",
 		Short: "Enroll this machine as an existing client",
 		Args:  cobra.ExactArgs(1),
-		RunE: func(_ *cobra.Command, _ []string) error {
-			return errClientNotYet()
+		RunE: func(cmd *cobra.Command, args []string) error {
+			return runClientUse(cmd.Context(), printerFor(cmd), args[0])
 		},
 	}
 }
+
+// clientPrompter returns the interactive prompter on a TTY, else nil (so
+// commands fall back to flags-only and never block on a pipe / in CI).
+func clientPrompter() prompter {
+	if isInteractiveTTY() {
+		return surveyPrompter{}
+	}
+	return nil
+}
+
+// authedClient loads the signed-in config and returns a token-bearing API
+// client, or an error telling the user to log in.
+func authedClient() (*api.Client, *config.Config, error) {
+	cfg, err := config.Load()
+	if err != nil {
+		return nil, nil, err
+	}
+	if !cfg.SignedIn() {
+		return nil, nil, errors.New("not signed in — run `tracebloc login` first")
+	}
+	env := cfg.Env
+	if env == "" {
+		env = api.ResolveEnv("")
+	}
+	client := newAPIClient(env)
+	client.Token = cfg.Token
+	return client, cfg, nil
+}
+
+func runClientCreate(ctx context.Context, p *ui.Printer, pr prompter, name, location string, yes bool) error {
+	client, cfg, err := authedClient()
+	if err != nil {
+		return &exitError{code: 1, err: err}
+	}
+
+	// Gather inputs first (flags win; prompt only what's missing, and only on a
+	// TTY), then show one review + confirm — matching the dataset-push flow.
+	if name == "" {
+		if pr == nil {
+			return errMissingFlag("--name")
+		}
+		if name, err = pr.Input("Client name", "shown on your dashboard + carbon reports", "", validateNonEmpty); err != nil {
+			return mapClientErr(err)
+		}
+	}
+	if location == "" {
+		if pr == nil {
+			return errMissingFlag("--location")
+		}
+		// Never silent-empty: the prompt requires a non-empty zone. (Cloud /
+		// GeoIP auto-detect of a suggested default is a fast-follow.)
+		if location, err = pr.Input("Location zone (e.g. DE)", "physical zone, for the carbon footprint", "", validateNonEmpty); err != nil {
+			return mapClientErr(err)
+		}
+	}
+
+	// Derive the namespace slug from the name, avoiding collisions with existing
+	// clients (best-effort: if the list call fails we still derive a base slug).
+	var existing []string
+	if clients, lerr := client.ListClients(ctx); lerr == nil {
+		for _, c := range clients {
+			if c.Namespace != "" {
+				existing = append(existing, c.Namespace)
+			}
+		}
+	}
+	namespace, err := slug.Derive(name, existing, "client-"+randHex(4))
+	if err != nil {
+		return &exitError{code: 1, err: err}
+	}
+
+	if pr != nil && !yes {
+		renderClientReview(p, name, namespace, location)
+		ok, cerr := pr.Confirm("Provision this client?", true)
+		if cerr != nil {
+			return mapClientErr(cerr)
+		}
+		if !ok {
+			p.Hintf("Cancelled.")
+			return nil
+		}
+	}
+
+	// The machine credential: the CLI generates the password, the backend stores
+	// it (write-only). The client-runtime authenticates with username+password.
+	password := randHex(24)
+	pc, err := client.CreateClient(ctx, api.CreateClientRequest{
+		Name:      name,
+		Namespace: namespace,
+		Location:  location,
+		Password:  password,
+	})
+	if err != nil {
+		var ae *api.APIError
+		if errors.As(err, &ae) && ae.StatusCode == http.StatusForbidden {
+			return askAnAdmin(ctx, p, client)
+		}
+		return &exitError{code: 1, err: err}
+	}
+
+	cfg.ActiveClientID = strconv.Itoa(pc.ID)
+	if serr := cfg.Save(); serr != nil {
+		return &exitError{code: 1, err: serr}
+	}
+
+	p.Newline()
+	p.Successf("Provisioned client %q (namespace %s).", pc.Name, pc.Namespace)
+	p.Section("Machine credential — needed by the installer to connect this client")
+	p.Field("client id", strconv.Itoa(pc.ID))
+	p.Field("username", pc.Username)
+	p.Field("password", password)
+	return nil
+}
+
+// askAnAdmin renders the "you can't provision — here's who can" path (a 403 from
+// the backend means no CLIENT_WRITE; backend#836 Q4).
+func askAnAdmin(ctx context.Context, p *ui.Printer, client *api.Client) error {
+	p.Newline()
+	p.Hintf("You don't have permission to provision a client in this account.")
+	if admins, err := client.ListClientAdmins(ctx); err == nil && len(admins) > 0 {
+		p.Section("Ask one of these admins to provision it (or grant you access)")
+		for _, a := range admins {
+			label := a.Name
+			if label == "" {
+				label = a.Email
+			}
+			p.Field(label, a.Email)
+		}
+	}
+	return &exitError{code: 1, err: errors.New("provisioning requires CLIENT_WRITE permission")}
+}
+
+func runClientList(ctx context.Context, p *ui.Printer) error {
+	client, cfg, err := authedClient()
+	if err != nil {
+		return &exitError{code: 1, err: err}
+	}
+	clients, err := client.ListClients(ctx)
+	if err != nil {
+		return &exitError{code: 1, err: err}
+	}
+	if len(clients) == 0 {
+		p.Hintf("No clients yet. Run `tracebloc client create`.")
+		return nil
+	}
+	p.Section("Clients in your account")
+	for _, c := range clients {
+		marker := ""
+		if strconv.Itoa(c.ID) == cfg.ActiveClientID {
+			marker = "  (active)"
+		}
+		p.Field(strconv.Itoa(c.ID)+marker,
+			fmt.Sprintf("%s   namespace=%s   location=%s", c.Name, c.Namespace, c.Location))
+	}
+	return nil
+}
+
+func runClientUse(ctx context.Context, p *ui.Printer, id string) error {
+	client, cfg, err := authedClient()
+	if err != nil {
+		return &exitError{code: 1, err: err}
+	}
+	clients, err := client.ListClients(ctx)
+	if err != nil {
+		return &exitError{code: 1, err: err}
+	}
+	for _, c := range clients {
+		if strconv.Itoa(c.ID) == id {
+			cfg.ActiveClientID = id
+			if serr := cfg.Save(); serr != nil {
+				return &exitError{code: 1, err: serr}
+			}
+			p.Successf("This machine is now set to enroll as client %s (%s).", id, c.Name)
+			return nil
+		}
+	}
+	return &exitError{code: 1, err: fmt.Errorf(
+		"no client %s in your account — run `tracebloc client list` to see the ids", id)}
+}
+
+// renderClientReview shows the assembled inputs before the confirm prompt, so
+// the user sees the derived namespace and location before anything is created.
+func renderClientReview(p *ui.Printer, name, namespace, location string) {
+	p.Section("Review")
+	p.Field("name", name)
+	p.Field("namespace", namespace)
+	p.Field("location", location)
+}
+
+// errMissingFlag reports a required flag absent in a non-interactive run (no TTY
+// to prompt — CI, a pipe, or output redirected).
+func errMissingFlag(flag string) error {
+	return &exitError{code: 1, err: fmt.Errorf("%s is required (non-interactive — no TTY to prompt)", flag)}
+}
+
+// validateNonEmpty rejects blank prompt input.
+func validateNonEmpty(s string) error {
+	if strings.TrimSpace(s) == "" {
+		return errors.New("required")
+	}
+	return nil
+}
+
+// mapClientErr turns a cancelled interactive prompt into a clean exit.
+func mapClientErr(err error) error {
+	if errors.Is(err, errInteractiveCancelled) {
+		return nil
+	}
+	return &exitError{code: 1, err: err}
+}
+
+// randHex returns nbytes of crypto-random data hex-encoded.
+func randHex(nbytes int) string {
+	b := make([]byte, nbytes)
+	_, _ = rand.Read(b) // crypto/rand.Read does not fail on a healthy system
+	return hex.EncodeToString(b)
+}
diff --git a/internal/cli/client_test.go b/internal/cli/client_test.go
new file mode 100644
index 0000000..d4ac0e4
--- /dev/null
+++ b/internal/cli/client_test.go
@@ -0,0 +1,233 @@
+package cli
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"fmt"
+	"net/http"
+	"net/http/httptest"
+	"strings"
+	"testing"
+
+	"github.com/tracebloc/cli/internal/api"
+	"github.com/tracebloc/cli/internal/config"
+	"github.com/tracebloc/cli/internal/ui"
+)
+
+// withClientBackend points the client commands at an httptest server (via the
+// newAPIClient seam) and writes a signed-in config to a temp dir.
+func withClientBackend(t *testing.T, h http.HandlerFunc) {
+	t.Helper()
+	srv := httptest.NewServer(h)
+	t.Cleanup(srv.Close)
+	t.Setenv("TRACEBLOC_CONFIG_DIR", t.TempDir())
+	if err := (&config.Config{Env: "dev", Token: "tok"}).Save(); err != nil {
+		t.Fatal(err)
+	}
+	orig := newAPIClient
+	newAPIClient = func(string) *api.Client {
+		return &api.Client{BaseURL: srv.URL, HTTP: srv.Client()}
+	}
+	t.Cleanup(func() { newAPIClient = orig })
+}
+
+func TestClientCreate_Success(t *testing.T) {
+	var body api.CreateClientRequest
+	withClientBackend(t, func(w http.ResponseWriter, r *http.Request) {
+		switch {
+		case r.Method == http.MethodGet && r.URL.Path == "/edge-device/":
+			_, _ = w.Write([]byte(`[]`)) // no existing clients
+		case r.Method == http.MethodPost && r.URL.Path == "/edge-device/":
+			if got := r.Header.Get("Authorization"); got != "Bearer tok" {
+				t.Errorf("auth header = %q, want Bearer tok", got)
+			}
+			_ = json.NewDecoder(r.Body).Decode(&body)
+			w.WriteHeader(http.StatusCreated)
+			_, _ = w.Write([]byte(`{"id":5,"first_name":"my-client","username":"u-123","namespace":"my-client","location":"DE"}`))
+		default:
+			t.Errorf("unexpected %s %s", r.Method, r.URL.Path)
+		}
+	})
+	var out bytes.Buffer
+	if err := runClientCreate(context.Background(), ui.New(&out), nil, "my-client", "DE", true); err != nil {
+		t.Fatalf("create: %v", err)
+	}
+	if body.Namespace != "my-client" || body.Location != "DE" || body.Password == "" {
+		t.Errorf("create body = %+v", body)
+	}
+	cfg, _ := config.Load()
+	if cfg.ActiveClientID != "5" {
+		t.Errorf("active client = %q, want 5", cfg.ActiveClientID)
+	}
+	if !strings.Contains(out.String(), "u-123") {
+		t.Errorf("output missing username:\n%s", out.String())
+	}
+}
+
+func TestClientCreate_AskAnAdmin(t *testing.T) {
+	withClientBackend(t, func(w http.ResponseWriter, r *http.Request) {
+		switch {
+		case r.URL.Path == "/edge-device/" && r.Method == http.MethodGet:
+			_, _ = w.Write([]byte(`[]`))
+		case r.URL.Path == "/edge-device/" && r.Method == http.MethodPost:
+			w.WriteHeader(http.StatusForbidden)
+			_, _ = w.Write([]byte(`{"detail":"no permission"}`))
+		case r.URL.Path == "/edge-device/admins/":
+			_, _ = w.Write([]byte(`[{"name":"Ada","email":"ada@co.io"}]`))
+		}
+	})
+	var out bytes.Buffer
+	err := runClientCreate(context.Background(), ui.New(&out), nil, "my-client", "DE", true)
+	if err == nil || !strings.Contains(err.Error(), "CLIENT_WRITE") {
+		t.Errorf("want permission error, got %v", err)
+	}
+	if !strings.Contains(out.String(), "ada@co.io") {
+		t.Errorf("expected admins shown, got:\n%s", out.String())
+	}
+}
+
+func TestClientCreate_RequiresLogin(t *testing.T) {
+	t.Setenv("TRACEBLOC_CONFIG_DIR", t.TempDir()) // no config → not signed in
+	err := runClientCreate(context.Background(), ui.New(&bytes.Buffer{}), nil, "x", "DE", true)
+	if err == nil || !strings.Contains(err.Error(), "login") {
+		t.Errorf("want not-signed-in error, got %v", err)
+	}
+}
+
+func TestClientList(t *testing.T) {
+	withClientBackend(t, func(w http.ResponseWriter, _ *http.Request) {
+		_, _ = w.Write([]byte(`[{"id":1,"first_name":"alpha","namespace":"alpha","location":"DE"},{"id":2,"first_name":"beta","namespace":"beta","location":"US"}]`))
+	})
+	var out bytes.Buffer
+	if err := runClientList(context.Background(), ui.New(&out)); err != nil {
+		t.Fatal(err)
+	}
+	for _, want := range []string{"alpha", "beta"} {
+		if !strings.Contains(out.String(), want) {
+			t.Errorf("list missing %q:\n%s", want, out.String())
+		}
+	}
+}
+
+func TestClientUse(t *testing.T) {
+	withClientBackend(t, func(w http.ResponseWriter, _ *http.Request) {
+		_, _ = w.Write([]byte(`[{"id":7,"first_name":"gamma","namespace":"gamma"}]`))
+	})
+	if err := runClientUse(context.Background(), ui.New(&bytes.Buffer{}), "7"); err != nil {
+		t.Fatal(err)
+	}
+	cfg, _ := config.Load()
+	if cfg.ActiveClientID != "7" {
+		t.Errorf("active = %q, want 7", cfg.ActiveClientID)
+	}
+	if err := runClientUse(context.Background(), ui.New(&bytes.Buffer{}), "99"); err == nil {
+		t.Error("expected an error for an unknown client id")
+	}
+}
+
+func TestClientCreate_Interactive(t *testing.T) {
+	var body api.CreateClientRequest
+	posted := false
+	withClientBackend(t, func(w http.ResponseWriter, r *http.Request) {
+		switch {
+		case r.Method == http.MethodGet && r.URL.Path == "/edge-device/":
+			_, _ = w.Write([]byte(`[]`))
+		case r.Method == http.MethodPost && r.URL.Path == "/edge-device/":
+			posted = true
+			_ = json.NewDecoder(r.Body).Decode(&body)
+			w.WriteHeader(http.StatusCreated)
+			_, _ = w.Write([]byte(`{"id":9,"first_name":"Lab One","username":"u-9","namespace":"lab-one","location":"DE"}`))
+		}
+	})
+	confirmYes := true
+	pr := &fakePrompter{
+		answers: map[string]string{
+			"Client name":             "Lab One",
+			"Location zone (e.g. DE)": "DE",
+		},
+		confirm: &confirmYes,
+	}
+	var out bytes.Buffer
+	if err := runClientCreate(context.Background(), ui.New(&out), pr, "", "", false); err != nil {
+		t.Fatalf("interactive create: %v", err)
+	}
+	if !posted {
+		t.Fatal("expected a POST after the user confirmed")
+	}
+	if body.Name != "Lab One" || body.Namespace != "lab-one" || body.Location != "DE" {
+		t.Errorf("create body = %+v", body)
+	}
+	if !strings.Contains(out.String(), "Review") {
+		t.Errorf("expected a review section before the confirm, got:\n%s", out.String())
+	}
+}
+
+func TestClientCreate_InteractiveCancel(t *testing.T) {
+	posted := false
+	withClientBackend(t, func(w http.ResponseWriter, r *http.Request) {
+		if r.Method == http.MethodPost {
+			posted = true
+		}
+		_, _ = w.Write([]byte(`[]`))
+	})
+	confirmNo := false
+	pr := &fakePrompter{
+		answers: map[string]string{
+			"Client name":             "Lab Two",
+			"Location zone (e.g. DE)": "US",
+		},
+		confirm: &confirmNo,
+	}
+	var out bytes.Buffer
+	if err := runClientCreate(context.Background(), ui.New(&out), pr, "", "", false); err != nil {
+		t.Fatalf("declining the confirm should be a clean exit, got: %v", err)
+	}
+	if posted {
+		t.Error("no client should be created when the user declines the confirm")
+	}
+	if !strings.Contains(out.String(), "Cancelled") {
+		t.Errorf("expected a Cancelled note, got:\n%s", out.String())
+	}
+}
+
+func TestClientList_Paginated(t *testing.T) {
+	withClientBackend(t, func(w http.ResponseWriter, r *http.Request) {
+		if r.URL.Query().Get("page") == "2" {
+			_, _ = w.Write([]byte(`{"count":2,"next":null,"results":[{"id":2,"first_name":"beta","namespace":"beta"}]}`))
+			return
+		}
+		// page 1: an absolute `next` link, like real DRF pagination
+		_, _ = fmt.Fprintf(w, `{"count":2,"next":"http://%s/edge-device/?page=2","results":[{"id":1,"first_name":"alpha","namespace":"alpha"}]}`, r.Host)
+	})
+	var out bytes.Buffer
+	if err := runClientList(context.Background(), ui.New(&out)); err != nil {
+		t.Fatal(err)
+	}
+	for _, want := range []string{"alpha", "beta"} {
+		if !strings.Contains(out.String(), want) {
+			t.Errorf("paginated list missing %q (next not followed?):\n%s", want, out.String())
+		}
+	}
+}
+
+func TestClientCreate_CollisionSuffix(t *testing.T) {
+	var body api.CreateClientRequest
+	withClientBackend(t, func(w http.ResponseWriter, r *http.Request) {
+		switch {
+		case r.Method == http.MethodGet:
+			// an existing client already holds "my-client"
+			_, _ = w.Write([]byte(`[{"id":1,"first_name":"My Client","namespace":"my-client"}]`))
+		case r.Method == http.MethodPost:
+			_ = json.NewDecoder(r.Body).Decode(&body)
+			w.WriteHeader(http.StatusCreated)
+			_, _ = w.Write([]byte(`{"id":2,"first_name":"My Client","username":"u-2","namespace":"my-client-2","location":"DE"}`))
+		}
+	})
+	if err := runClientCreate(context.Background(), ui.New(&bytes.Buffer{}), nil, "My Client", "DE", true); err != nil {
+		t.Fatal(err)
+	}
+	if body.Namespace != "my-client-2" {
+		t.Errorf("namespace = %q, want my-client-2 (collision suffix not applied)", body.Namespace)
+	}
+}
diff --git a/internal/slug/slug.go b/internal/slug/slug.go
new file mode 100644
index 0000000..fb3f8a0
--- /dev/null
+++ b/internal/slug/slug.go
@@ -0,0 +1,94 @@
+// Package slug ports the RFC-0001 namespace-slug rule (backend
+// common/utils/slug.py, RFC-0001 Appendix B) to Go. A client's display name is
+// slugified ONCE at provisioning into the immutable Kubernetes namespace, so
+// this MUST stay in lock-step with the Python definition — the backend
+// validates exactly what this produces (RFC-0001 backend#830; provisioning +
+// namespace validation in backend#836).
+package slug
+
+import (
+	"fmt"
+	"regexp"
+	"strings"
+
+	"golang.org/x/text/unicode/norm"
+)
+
+// MaxLabelLength is the DNS-1123 label cap.
+const MaxLabelLength = 63
+
+var (
+	nonAlnum  = regexp.MustCompile(`[^a-z0-9]+`)
+	multiDash = regexp.MustCompile(`-+`)
+)
+
+// Slugify maps name to a DNS-1123 label, or "" if nothing survives. Mirrors
+// slug.slugify_dns1123: NFKD-transliterate to ASCII, lowercase, map every run
+// of non-alphanumerics to a single "-", trim leading/trailing "-", cap at 63.
+func Slugify(name string) string {
+	if name == "" {
+		return ""
+	}
+	s := strings.ToLower(toASCII(name))
+	s = nonAlnum.ReplaceAllString(s, "-")
+	// Redundant after the run-collapse above, but slug.py runs the identical
+	// second pass — mirror it so the two stay structurally in lock-step.
+	s = multiDash.ReplaceAllString(s, "-")
+	s = strings.Trim(s, "-")
+	if len(s) > MaxLabelLength {
+		s = s[:MaxLabelLength]
+	}
+	return strings.TrimRight(s, "-")
+}
+
+// Derive returns a UNIQUE DNS-1123 slug for name, avoiding taken. On collision
+// it appends -2, -3, … within the 63-char cap. If name slugifies to empty it
+// falls back to fallback; with an empty fallback it errors (an empty slug must
+// never silently become a namespace). Mirrors slug.derive_slug — except Go has
+// no nil string, so fallback=="" is treated as "no fallback" (erroring); the
+// Python distinguishes None from "" but no caller passes "", and erroring is
+// the safer side of that edge.
+func Derive(name string, taken []string, fallback string) (string, error) {
+	base := Slugify(name)
+	if base == "" {
+		if fallback == "" {
+			return "", fmt.Errorf("name %q slugifies to empty; a fallback is required", name)
+		}
+		if base = Slugify(fallback); base == "" {
+			base = fallback
+		}
+	}
+	set := make(map[string]struct{}, len(taken))
+	for _, t := range taken {
+		set[t] = struct{}{}
+	}
+	if _, clash := set[base]; !clash {
+		return base, nil
+	}
+	for n := 2; ; n++ {
+		suffix := fmt.Sprintf("-%d", n)
+		end := MaxLabelLength - len(suffix)
+		if end > len(base) {
+			end = len(base)
+		}
+		if end < 0 {
+			end = 0
+		}
+		cand := strings.TrimRight(base[:end], "-") + suffix
+		if _, clash := set[cand]; !clash {
+			return cand, nil
+		}
+	}
+}
+
+func toASCII(s string) string {
+	// NFKD decompose then drop non-ASCII — matches Python's
+	// unicodedata.normalize("NFKD", s).encode("ascii", "ignore").
+	var b strings.Builder
+	for _, r := range norm.NFKD.String(s) {
+		if r < 128 {
+			b.WriteByte(byte(r))
+		}
+	}
+	return b.String()
+}
diff --git a/internal/slug/slug_golden_test.go b/internal/slug/slug_golden_test.go
new file mode 100644
index 0000000..04e25c2
--- /dev/null
+++ b/internal/slug/slug_golden_test.go
@@ -0,0 +1,44 @@
+package slug
+
+import "testing"
+
+// goldenPairs are (display name → DNS-1123 slug) pairs verified BYTE-IDENTICAL
+// against the canonical Python slugify_dns1123 (backend common/utils/slug.py)
+// with a cross-language harness. They lock the NFKD transliteration — the one
+// place a Go port can silently drift from the backend validator that rejects
+// what this produces. If slug.py ever changes, re-run the harness and update
+// these rather than hand-editing.
+var goldenPairs = []struct{ name, want string }{
+	{"My Client", "my-client"},
+	{"café", "cafe"}, // canonical NFKD: é → e + accent (dropped)
+	{"CAFÉ", "cafe"}, // + lowercase
+	{"Müller GmbH", "muller-gmbh"},
+	{"Straße 42", "strae-42"}, // ß does not decompose → dropped (matches Python)
+	{"naïve", "naive"},
+	{"São Paulo", "sao-paulo"},
+	{"Zürich-Lab", "zurich-lab"},
+	{"北京 client", "client"},           // CJK dropped
+	{"client🚀rocket", "clientrocket"}, // emoji dropped
+	{"über_cool", "uber-cool"},
+	{"piñata", "pinata"},
+	{"Ω omega", "omega"},
+	{"ﬁ-ligature", "fi-ligature"},  // compat NFKD: ﬁ ligature → "fi"
+	{"①②③ circled", "123-circled"}, // circled digits → "123"
+	{"½ half", "12-half"},          // vulgar fraction → "1" + fraction-slash(dropped) + "2"
+	{"ＦＵＬＬ width", "full-width"},   // fullwidth → ASCII
+	{"trailing---dashes---", "trailing-dashes"},
+	{"UPPER MixED", "upper-mixed"},
+	{"a.b.c-d_e f", "a-b-c-d-e-f"},
+	{"2001: A Space Odyssey", "2001-a-space-odyssey"},
+	{"Ⅻ roman", "xii-roman"}, // roman numeral → "XII" → "xii"
+	{"²cubed³", "2cubed3"},   // super/subscripts → digits
+	{"Hello @ World #1", "hello-world-1"},
+}
+
+func TestSlugifyGoldenParity(t *testing.T) {
+	for _, p := range goldenPairs {
+		if got := Slugify(p.name); got != p.want {
+			t.Errorf("Slugify(%q) = %q, want %q (golden parity with slug.py)", p.name, got, p.want)
+		}
+	}
+}
diff --git a/internal/slug/slug_test.go b/internal/slug/slug_test.go
new file mode 100644
index 0000000..ac5c516
--- /dev/null
+++ b/internal/slug/slug_test.go
@@ -0,0 +1,45 @@
+package slug
+
+import (
+	"strings"
+	"testing"
+)
+
+func TestSlugify(t *testing.T) {
+	cases := map[string]string{
+		"My Client":             "my-client",
+		"café":                  "cafe", // NFKD transliteration, matches slug.py
+		"  spaces  ":            "spaces",
+		"UPPER_Case":            "upper-case",
+		"a--b":                  "a-b", // collapse consecutive dashes
+		"--trim--":              "trim",
+		"":                      "",
+		"!!!":                   "",
+		strings.Repeat("a", 70): strings.Repeat("a", 63), // 63-char cap
+	}
+	for in, want := range cases {
+		if got := Slugify(in); got != want {
+			t.Errorf("Slugify(%q) = %q, want %q", in, got, want)
+		}
+	}
+}
+
+func TestDerive(t *testing.T) {
+	if got, err := Derive("My Client", nil, ""); err != nil || got != "my-client" {
+		t.Errorf("no collision: got %q, err %v", got, err)
+	}
+	if got, _ := Derive("My Client", []string{"my-client"}, ""); got != "my-client-2" {
+		t.Errorf("collision: got %q, want my-client-2", got)
+	}
+	if got, _ := Derive("My Client", []string{"my-client", "my-client-2"}, ""); got != "my-client-3" {
+		t.Errorf("double collision: got %q, want my-client-3", got)
+	}
+	// CJK slugifies to empty → uses fallback
+	if got, _ := Derive("世界", nil, "client-abc"); got != "client-abc" {
+		t.Errorf("fallback: got %q, want client-abc", got)
+	}
+	// empty slug + no fallback → error (never a silent-empty namespace)
+	if _, err := Derive("!!!", nil, ""); err == nil {
+		t.Error("expected an error for empty slug with no fallback")
+	}
+}

From 4d3a00e37d0b7a6cfcd51cb03b1d962e964dc0f3 Mon Sep 17 00:00:00 2001
From: lukasWuttke <54042461+LukasWodka@users.noreply.github.com>
Date: Wed, 24 Jun 2026 15:23:20 +0200
Subject: [PATCH 10/15] chore(schema): re-vendor ingest.v1.json from
 data-ingestors (fix drift check) (#103)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The CI "Schema drift check" (scripts/sync-schema.sh --check) was failing on develop
itself and therefore every PR — the vendored internal/schema/ingest.v1.json had
drifted from data-ingestors master, which added the `causal_language_modeling` task
category (data-ingestors#805: the enum + a self-supervised "requires texts, not
label" conditional).

- Re-vendored the schema (sync-schema.sh) → --check now passes.
- Registered the new category in internal/push/category.go as recognize-but-not-yet-
  CLI-supported (CLISupported:false + UnsupportedNote): the CLI's discover/build for
  its raw-.txt / prompt\tcompletion `texts` layout isn't implemented, so push cleanly
  reports it as pending rather than leaving a schema<->registry gap (the cli#74 drift
  class the registry exists to prevent). Updated the parity test.

Scope: schema re-vendor + the registry recognition only. Full CLI push support for
causal_language_modeling (discover/build) is a follow-up feature, not this PR.

go build/vet/test ./... green; drift check green.

Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 internal/push/category.go               |  2 ++
 internal/push/category_registry_test.go |  8 ++++----
 internal/schema/ingest.v1.json          | 16 +++++++++++++---
 3 files changed, 19 insertions(+), 7 deletions(-)

diff --git a/internal/push/category.go b/internal/push/category.go
index 9ea14c3..b48640d 100644
--- a/internal/push/category.go
+++ b/internal/push/category.go
@@ -68,6 +68,8 @@ var categoryRegistry = []CategorySpec{
 		UnsupportedNote: "blocked on the ingestor's mask-sidecar support (data-ingestors#136)"},
 	{ID: "instance_segmentation", Family: FamilyImage, Label: "Instance segmentation", CLISupported: false,
 		UnsupportedNote: "not implemented"},
+	{ID: "causal_language_modeling", Family: FamilyText, Label: "Causal language modeling", CLISupported: false,
+		UnsupportedNote: "schema-recognized (data-ingestors#805); `tracebloc ingest` discover/build for its raw-.txt / prompt\\tcompletion `texts` layout is pending"},
 }
 
 // categoryByID indexes the registry for O(1) lookup, built once.
diff --git a/internal/push/category_registry_test.go b/internal/push/category_registry_test.go
index 1bb40ea..997fc2e 100644
--- a/internal/push/category_registry_test.go
+++ b/internal/push/category_registry_test.go
@@ -13,7 +13,7 @@ func TestRegistryKnownCategories(t *testing.T) {
 	want := []string{
 		"image_classification", "object_detection", "keypoint_detection",
 		"semantic_segmentation", "instance_segmentation",
-		"text_classification", "masked_language_modeling",
+		"text_classification", "masked_language_modeling", "causal_language_modeling",
 		"tabular_classification", "tabular_regression",
 		"time_series_forecasting", "time_to_event_prediction",
 	}
@@ -40,9 +40,9 @@ func TestSupportedCategories(t *testing.T) {
 			t.Errorf("SupportedCategoryIDs returned %q but IsCLISupported is false", id)
 		}
 	}
-	// semantic_/instance_segmentation are known but not yet pushable, and
-	// must explain why.
-	for _, id := range []string{"semantic_segmentation", "instance_segmentation"} {
+	// segmentation + causal_language_modeling are known but not yet pushable,
+	// and must explain why.
+	for _, id := range []string{"semantic_segmentation", "instance_segmentation", "causal_language_modeling"} {
 		if !IsKnown(id) {
 			t.Errorf("%s should be known", id)
 		}
diff --git a/internal/schema/ingest.v1.json b/internal/schema/ingest.v1.json
index 7f4f4a0..b5332b9 100644
--- a/internal/schema/ingest.v1.json
+++ b/internal/schema/ingest.v1.json
@@ -36,7 +36,8 @@
         "tabular_regression",
         "time_series_forecasting",
         "time_to_event_prediction",
-        "masked_language_modeling"
+        "masked_language_modeling",
+        "causal_language_modeling"
       ],
       "description": "Task category. Drives convention defaults: validators, data_format, default columns, default file extensions, default validator set."
     },
@@ -86,7 +87,7 @@
     "texts": {
       "type": "string",
       "minLength": 1,
-      "description": "Directory holding text files referenced by the labels CSV. Required for text_classification and token_classification."
+      "description": "Directory holding text files referenced by the labels CSV. Required for text_classification, token_classification, and causal_language_modeling (raw .txt: plain text, or a tab-separated prompt\\tcompletion pair)."
     },
 
     "sequences": {
@@ -336,6 +337,14 @@
       },
       "then": { "required": ["sequences"] }
     },
+    {
+      "description": "causal_language_modeling requires `texts` (raw .txt samples). It is self-supervised, so unlike text/token classification it does NOT require `label`.",
+      "if": {
+        "properties": { "category": { "const": "causal_language_modeling" } },
+        "required": ["category"]
+      },
+      "then": { "required": ["texts"] }
+    },
     {
       "description": "tabular and time-series categories require `schema`.",
       "if": {
@@ -411,7 +420,8 @@
         "properties": {
           "category": {
             "enum": [
-              "masked_language_modeling"
+              "masked_language_modeling",
+              "causal_language_modeling"
             ]
           }
         },

From 937a304adb80b75964063986600e1e0366e9db33 Mon Sep 17 00:00:00 2001
From: lukasWuttke <54042461+LukasWodka@users.noreply.github.com>
Date: Wed, 24 Jun 2026 15:36:37 +0200
Subject: [PATCH 11/15] =?UTF-8?q?feat(cli):=20client=20create=20reads=20th?=
 =?UTF-8?q?e=20cluster=20anchor=20=E2=80=94=20idempotent=20get-or-create?=
 =?UTF-8?q?=20+=20409=20(#84)=20(#102)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* feat(cli): client create reads the cluster anchor — idempotent get-or-create + 409 (#84)

RFC-0001 §7.2 / backend#883: `client create` now reads the cluster's kube-system UID
and sends it as cluster_id, so the backend does get-or-create keyed on it.

- Reads the anchor via a new cluster.ClusterID (kube-system namespace UID) behind
  --kubeconfig/--context flags. Best-effort + never-silent: if the cluster isn't
  reachable it provisions WITHOUT an anchor (a plain mint) and says so.
- api.CreateClient returns adopted (HTTP 200) vs minted (201): an idempotent re-run
  on the same cluster adopts the existing client (no new credential printed) instead
  of duplicating; a 409 → a clear "registered to a different account" (cluster_conflict).
- Adds api.BackfillClusterID (PATCH /edge-device/<id>/) for the adopt-backfill path
  (the installer #838 orchestrates the full R7 flow).

Scope: anchor + idempotency only. never-show (writing the credential into the cluster
secret) and the R7 in-cluster TB_CLIENT_ID backfill orchestration land with the
installer reorder (#838); the mint-time credential print stays as the interim.

Tests: cluster.clusterIDFrom (fake clientset); api CreateClient mint/adopt/409 +
BackfillClusterID; cli create anchor-mint / adopt-idempotent / 409 / no-cluster-warns.
go build/vet/test ./... green (Go 1.26).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

* fix(cli): bound the anchor read + make adopt save-failure non-fatal

Two review fixes folded into the create-anchor work (#84):

- cluster.ClusterID: cap the best-effort kube-system read with an 8s
  rest.Config timeout. A kubeconfig pointing at an unreachable API
  server would otherwise hang the GET for the OS TCP timeout; now
  `client create` degrades to a non-anchored mint promptly instead of
  stalling before the review prompt.
- cli client create: on an idempotent adopt, print the result before
  saving the active-client pointer and treat a save failure as a hint
  (mirroring the mint path), so a config-save error can't bury the
  "adopted it" message or the recovery hint.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Co-authored-by: Asad Iqbal <asad.dsoft@gmail.com>
---
 internal/api/client.go            |  27 ++++--
 internal/api/client_test.go       |  56 +++++++++++
 internal/cli/client.go            |  87 ++++++++++++++---
 internal/cli/client_test.go       | 153 ++++++++++++++++++++++++++++--
 internal/cluster/identity.go      |  50 ++++++++++
 internal/cluster/identity_test.go |  30 ++++++
 6 files changed, 375 insertions(+), 28 deletions(-)
 create mode 100644 internal/cluster/identity.go
 create mode 100644 internal/cluster/identity_test.go

diff --git a/internal/api/client.go b/internal/api/client.go
index a636620..fbd97bb 100644
--- a/internal/api/client.go
+++ b/internal/api/client.go
@@ -262,6 +262,9 @@ type ProvisionedClient struct {
 	Namespace string `json:"namespace"`
 	Location  string `json:"location"`
 	Status    int    `json:"status"`
+	// ClusterID is the kube-system namespace UID this client is anchored to
+	// (RFC-0001 §6.3 / backend#883). Empty on legacy / not-yet-backfilled clients.
+	ClusterID string `json:"cluster_id"`
 }
 
 // CreateClientRequest is the POST /edge-device/ body. The account is stamped
@@ -272,6 +275,10 @@ type CreateClientRequest struct {
 	Namespace string `json:"namespace"`
 	Location  string `json:"location"`
 	Password  string `json:"password"`
+	// ClusterID anchors the client to this cluster (the kube-system namespace UID)
+	// so create is get-or-create keyed on it (RFC-0001 §7.2 / backend#883). Omitted
+	// when the cluster identity can't be read (dual-mode / legacy → plain mint).
+	ClusterID string `json:"cluster_id,omitempty"`
 }
 
 // AdminContact is one "ask an admin" entry from GET /edge-device/admins/.
@@ -280,23 +287,27 @@ type AdminContact struct {
 	Email string `json:"email"`
 }
 
-// CreateClient provisions a client. A 403 *APIError means the caller lacks
-// CLIENT_WRITE — callers fall back to ListClientAdmins for the ask-an-admin
-// path (backend#836 Q4).
-func (c *Client) CreateClient(ctx context.Context, req CreateClientRequest) (*ProvisionedClient, error) {
+// CreateClient provisions a client — get-or-create keyed on cluster_id when one
+// is supplied (RFC-0001 §7.2 / backend#883). The returned `adopted` is true when
+// the backend matched an existing client for this cluster (HTTP 200, an idempotent
+// re-run) and false when it minted a new one (HTTP 201). A 403 *APIError means the
+// caller lacks CLIENT_WRITE (→ ask-an-admin, backend#836 Q4); a 409 *APIError means
+// the cluster is bound to another account (cluster_conflict, R6).
+func (c *Client) CreateClient(ctx context.Context, req CreateClientRequest) (pc *ProvisionedClient, adopted bool, err error) {
 	url := c.BaseURL + "/edge-device/"
 	status, raw, err := c.post(ctx, "/edge-device/", req)
 	if err != nil {
-		return nil, err
+		return nil, false, err
 	}
 	if status < 200 || status >= 300 {
-		return nil, &APIError{StatusCode: status, Body: string(raw), URL: url}
+		return nil, false, &APIError{StatusCode: status, Body: string(raw), URL: url}
 	}
 	var out ProvisionedClient
 	if err := json.Unmarshal(raw, &out); err != nil {
-		return nil, fmt.Errorf("decoding create-client response: %w", err)
+		return nil, false, fmt.Errorf("decoding create-client response: %w", err)
 	}
-	return &out, nil
+	// 200 = adopted an existing client for this cluster_id; 201 = freshly minted.
+	return &out, status == http.StatusOK, nil
 }
 
 // maxListPages bounds how many pages ListClients will follow — a backstop
diff --git a/internal/api/client_test.go b/internal/api/client_test.go
index 41af464..f5fbdcd 100644
--- a/internal/api/client_test.go
+++ b/internal/api/client_test.go
@@ -2,6 +2,7 @@ package api
 
 import (
 	"context"
+	"encoding/json"
 	"errors"
 	"net/http"
 	"net/http/httptest"
@@ -138,3 +139,58 @@ func TestWhoAmIUnauthorized(t *testing.T) {
 		t.Errorf("want APIError 401, got %v", err)
 	}
 }
+
+func TestCreateClientMintAndAdopt(t *testing.T) {
+	for _, tc := range []struct {
+		name        string
+		code        int
+		wantAdopted bool
+	}{
+		{"mint", http.StatusCreated, false},
+		{"adopt", http.StatusOK, true},
+	} {
+		t.Run(tc.name, func(t *testing.T) {
+			var sent CreateClientRequest
+			srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+				if r.URL.Path != "/edge-device/" || r.Method != http.MethodPost {
+					t.Errorf("unexpected %s %s", r.Method, r.URL.Path)
+				}
+				_ = json.NewDecoder(r.Body).Decode(&sent)
+				w.WriteHeader(tc.code)
+				_, _ = w.Write([]byte(`{"id":5,"first_name":"c","namespace":"c","cluster_id":"uid-1"}`))
+			}))
+			defer srv.Close()
+			c := New("prod")
+			c.BaseURL = srv.URL
+			pc, adopted, err := c.CreateClient(context.Background(),
+				CreateClientRequest{Name: "c", Namespace: "c", Password: "pw", ClusterID: "uid-1"})
+			if err != nil {
+				t.Fatal(err)
+			}
+			if adopted != tc.wantAdopted {
+				t.Errorf("adopted = %v, want %v", adopted, tc.wantAdopted)
+			}
+			if sent.ClusterID != "uid-1" {
+				t.Errorf("cluster_id sent = %q, want uid-1", sent.ClusterID)
+			}
+			if pc.ClusterID != "uid-1" {
+				t.Errorf("cluster_id parsed = %q, want uid-1", pc.ClusterID)
+			}
+		})
+	}
+}
+
+func TestCreateClientConflict(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+		w.WriteHeader(http.StatusConflict)
+		_, _ = w.Write([]byte(`{"error":"cluster_conflict","cluster_id":"uid-1"}`))
+	}))
+	defer srv.Close()
+	c := New("prod")
+	c.BaseURL = srv.URL
+	var ae *APIError
+	_, _, err := c.CreateClient(context.Background(), CreateClientRequest{ClusterID: "uid-1"})
+	if !errors.As(err, &ae) || ae.StatusCode != http.StatusConflict {
+		t.Errorf("want APIError 409, got %v", err)
+	}
+}
diff --git a/internal/cli/client.go b/internal/cli/client.go
index be66b41..481ddf7 100644
--- a/internal/cli/client.go
+++ b/internal/cli/client.go
@@ -13,11 +13,17 @@ import (
 	"github.com/spf13/cobra"
 
 	"github.com/tracebloc/cli/internal/api"
+	"github.com/tracebloc/cli/internal/cluster"
 	"github.com/tracebloc/cli/internal/config"
 	"github.com/tracebloc/cli/internal/slug"
 	"github.com/tracebloc/cli/internal/ui"
 )
 
+// readClusterID reads the cluster's kube-system UID — the RFC-0001 idempotency
+// anchor (§7.2 / backend#883). A package var so tests can stub it without a
+// reachable cluster.
+var readClusterID = cluster.ClusterID
+
 // newClientCmd wires the `tracebloc client` subtree — provisioning + selecting
 // the client (machine) this host enrolls as. Consumes the backend provisioning
 // endpoints (backend#836) with the user token from `tracebloc login`.
@@ -33,24 +39,35 @@ in your account.  Requires sign-in first (` + "`tracebloc login`" + `).`,
 }
 
 func newClientCreateCmd() *cobra.Command {
-	var name, location string
+	var name, location, kubeconfigPath, contextOverride string
 	var yes bool
 	cmd := &cobra.Command{
 		Use:   "create",
 		Short: "Provision a new client for this machine (--name, --location)",
 		Args:  cobra.NoArgs,
 		RunE: func(cmd *cobra.Command, _ []string) error {
-			return runClientCreate(cmd.Context(), printerFor(cmd), clientPrompter(), name, location, yes)
+			return runClientCreate(cmd.Context(), printerFor(cmd), clientPrompter(),
+				clientCreateOpts{name: name, location: location, kubeconfigPath: kubeconfigPath, contextOverride: contextOverride, yes: yes})
 		},
 	}
 	cmd.Flags().StringVar(&name, "name", "",
 		"human-readable client name (shown on your dashboard + carbon reports)")
 	cmd.Flags().StringVar(&location, "location", "",
 		"location zone for carbon footprint (e.g. DE); prompted if omitted")
+	cmd.Flags().StringVar(&kubeconfigPath, "kubeconfig", "",
+		"path to the kubeconfig for the target cluster (default: $KUBECONFIG, then ~/.kube/config) — read to anchor the client to this cluster")
+	cmd.Flags().StringVar(&contextOverride, "context", "",
+		"kubeconfig context for the target cluster (default: current-context)")
 	cmd.Flags().BoolVar(&yes, "yes", false, "skip the confirmation prompt")
 	return cmd
 }
 
+// clientCreateOpts bundles the `client create` inputs (flags + resolved prompts).
+type clientCreateOpts struct {
+	name, location, kubeconfigPath, contextOverride string
+	yes                                             bool
+}
+
 func newClientListCmd() *cobra.Command {
 	return &cobra.Command{
 		Use:     "list",
@@ -102,12 +119,14 @@ func authedClient() (*api.Client, *config.Config, error) {
 	return client, cfg, nil
 }
 
-func runClientCreate(ctx context.Context, p *ui.Printer, pr prompter, name, location string, yes bool) error {
+func runClientCreate(ctx context.Context, p *ui.Printer, pr prompter, opts clientCreateOpts) error {
 	client, cfg, err := authedClient()
 	if err != nil {
 		return &exitError{code: 1, err: err}
 	}
 
+	name, location := opts.name, opts.location
+
 	// Gather inputs first (flags win; prompt only what's missing, and only on a
 	// TTY), then show one review + confirm — matching the dataset-push flow.
 	if name == "" {
@@ -129,6 +148,16 @@ func runClientCreate(ctx context.Context, p *ui.Printer, pr prompter, name, loca
 		}
 	}
 
+	// Read the cluster anchor (kube-system UID) so create is get-or-create keyed on
+	// it — re-running on the same cluster adopts the existing client instead of
+	// minting a duplicate (RFC-0001 §7.2 / backend#883). Best-effort + never silent:
+	// if the cluster isn't reachable we provision WITHOUT an anchor (a plain mint)
+	// and say so, rather than blocking.
+	clusterID, cidErr := readClusterID(ctx, cluster.KubeconfigOptions{Path: opts.kubeconfigPath, Context: opts.contextOverride})
+	if cidErr != nil {
+		p.Hintf("Couldn't read the target cluster's identity — provisioning without a cluster anchor, so re-running won't be idempotent. Point --kubeconfig/--context at the reachable cluster to enable that.")
+	}
+
 	// Derive the namespace slug from the name, avoiding collisions with existing
 	// clients (best-effort: if the list call fails we still derive a base slug).
 	var existing []string
@@ -144,8 +173,8 @@ func runClientCreate(ctx context.Context, p *ui.Printer, pr prompter, name, loca
 		return &exitError{code: 1, err: err}
 	}
 
-	if pr != nil && !yes {
-		renderClientReview(p, name, namespace, location)
+	if pr != nil && !opts.yes {
+		renderClientReview(p, name, namespace, location, clusterID)
 		ok, cerr := pr.Confirm("Provision this client?", true)
 		if cerr != nil {
 			return mapClientErr(cerr)
@@ -157,33 +186,62 @@ func runClientCreate(ctx context.Context, p *ui.Printer, pr prompter, name, loca
 	}
 
 	// The machine credential: the CLI generates the password, the backend stores
-	// it (write-only). The client-runtime authenticates with username+password.
+	// it (write-only). Sent on every create but used only when minting — on an
+	// idempotent adopt the backend keeps the existing client's credential (§7.2),
+	// so the generated value is never printed in that case.
 	password := randHex(24)
-	pc, err := client.CreateClient(ctx, api.CreateClientRequest{
+	pc, adopted, err := client.CreateClient(ctx, api.CreateClientRequest{
 		Name:      name,
 		Namespace: namespace,
 		Location:  location,
 		Password:  password,
+		ClusterID: clusterID,
 	})
 	if err != nil {
 		var ae *api.APIError
-		if errors.As(err, &ae) && ae.StatusCode == http.StatusForbidden {
-			return askAnAdmin(ctx, p, client)
+		if errors.As(err, &ae) {
+			switch ae.StatusCode {
+			case http.StatusForbidden:
+				return askAnAdmin(ctx, p, client)
+			case http.StatusConflict:
+				// Per RFC C.3 the only 409 on POST /edge-device/ is cluster_conflict
+				// (R6): this cluster_id is bound to another account.
+				return &exitError{code: 1, err: errors.New(
+					"this cluster is already registered to a different tracebloc account — " +
+						"sign in to that account, or ask your admin (cluster_conflict)")}
+			}
 		}
 		return &exitError{code: 1, err: err}
 	}
 
 	cfg.ActiveClientID = strconv.Itoa(pc.ID)
-	if serr := cfg.Save(); serr != nil {
-		return &exitError{code: 1, err: serr}
-	}
 
 	p.Newline()
+	if adopted {
+		// Idempotent re-run: the backend matched this cluster_id to an existing
+		// client and returned it — no new credential. (The existing-fleet R7 case,
+		// where the backend instead matches a live in-cluster TB_CLIENT_ID whose
+		// cluster_id is still null and the CLI backfills it via PATCH, is the
+		// installer's orchestration — #838 — not done here.)
+		p.Successf("This cluster is already registered as client %q (namespace %s) — adopted it.", pc.Name, pc.Namespace)
+		p.Hintf("No new credential issued; the existing one stands. This machine is set to enroll as client %d.", pc.ID)
+		// Mirror the mint path: a config-save failure shouldn't bury the result —
+		// hint how to set the pointer by hand and still exit clean.
+		if serr := cfg.Save(); serr != nil {
+			p.Hintf("Couldn't save the active-client pointer (%v) — run `tracebloc client use %d` to set it.", serr, pc.ID)
+		}
+		return nil
+	}
+	// Mint: print the credential FIRST — it's the only copy (the backend stores
+	// only the hash), so a later config-save failure must never cost it.
 	p.Successf("Provisioned client %q (namespace %s).", pc.Name, pc.Namespace)
 	p.Section("Machine credential — needed by the installer to connect this client")
 	p.Field("client id", strconv.Itoa(pc.ID))
 	p.Field("username", pc.Username)
 	p.Field("password", password)
+	if serr := cfg.Save(); serr != nil {
+		p.Hintf("Couldn't save the active-client pointer (%v) — run `tracebloc client use %d` to set it.", serr, pc.ID)
+	}
 	return nil
 }
 
@@ -255,11 +313,14 @@ func runClientUse(ctx context.Context, p *ui.Printer, id string) error {
 
 // renderClientReview shows the assembled inputs before the confirm prompt, so
 // the user sees the derived namespace and location before anything is created.
-func renderClientReview(p *ui.Printer, name, namespace, location string) {
+func renderClientReview(p *ui.Printer, name, namespace, location, clusterID string) {
 	p.Section("Review")
 	p.Field("name", name)
 	p.Field("namespace", namespace)
 	p.Field("location", location)
+	if clusterID != "" {
+		p.Field("cluster", clusterID+"  (anchors this client — re-runs adopt it)")
+	}
 }
 
 // errMissingFlag reports a required flag absent in a non-interactive run (no TTY
diff --git a/internal/cli/client_test.go b/internal/cli/client_test.go
index d4ac0e4..5d77cb7 100644
--- a/internal/cli/client_test.go
+++ b/internal/cli/client_test.go
@@ -4,6 +4,7 @@ import (
 	"bytes"
 	"context"
 	"encoding/json"
+	"errors"
 	"fmt"
 	"net/http"
 	"net/http/httptest"
@@ -11,12 +12,15 @@ import (
 	"testing"
 
 	"github.com/tracebloc/cli/internal/api"
+	"github.com/tracebloc/cli/internal/cluster"
 	"github.com/tracebloc/cli/internal/config"
 	"github.com/tracebloc/cli/internal/ui"
 )
 
 // withClientBackend points the client commands at an httptest server (via the
-// newAPIClient seam) and writes a signed-in config to a temp dir.
+// newAPIClient seam) and writes a signed-in config to a temp dir. It also stubs
+// readClusterID to "no cluster" by default, so create tests never touch a real
+// kubeconfig/cluster — tests that exercise the anchor override it via stubClusterID.
 func withClientBackend(t *testing.T, h http.HandlerFunc) {
 	t.Helper()
 	srv := httptest.NewServer(h)
@@ -30,6 +34,22 @@ func withClientBackend(t *testing.T, h http.HandlerFunc) {
 		return &api.Client{BaseURL: srv.URL, HTTP: srv.Client()}
 	}
 	t.Cleanup(func() { newAPIClient = orig })
+
+	origCID := readClusterID
+	readClusterID = func(context.Context, cluster.KubeconfigOptions) (string, error) {
+		return "", errors.New("no cluster reachable (test default)")
+	}
+	t.Cleanup(func() { readClusterID = origCID })
+}
+
+// stubClusterID overrides the cluster-anchor read for a single test.
+func stubClusterID(t *testing.T, uid string, err error) {
+	t.Helper()
+	orig := readClusterID
+	readClusterID = func(context.Context, cluster.KubeconfigOptions) (string, error) {
+		return uid, err
+	}
+	t.Cleanup(func() { readClusterID = orig })
 }
 
 func TestClientCreate_Success(t *testing.T) {
@@ -50,7 +70,7 @@ func TestClientCreate_Success(t *testing.T) {
 		}
 	})
 	var out bytes.Buffer
-	if err := runClientCreate(context.Background(), ui.New(&out), nil, "my-client", "DE", true); err != nil {
+	if err := runClientCreate(context.Background(), ui.New(&out), nil, clientCreateOpts{name: "my-client", location: "DE", yes: true}); err != nil {
 		t.Fatalf("create: %v", err)
 	}
 	if body.Namespace != "my-client" || body.Location != "DE" || body.Password == "" {
@@ -78,7 +98,7 @@ func TestClientCreate_AskAnAdmin(t *testing.T) {
 		}
 	})
 	var out bytes.Buffer
-	err := runClientCreate(context.Background(), ui.New(&out), nil, "my-client", "DE", true)
+	err := runClientCreate(context.Background(), ui.New(&out), nil, clientCreateOpts{name: "my-client", location: "DE", yes: true})
 	if err == nil || !strings.Contains(err.Error(), "CLIENT_WRITE") {
 		t.Errorf("want permission error, got %v", err)
 	}
@@ -89,7 +109,7 @@ func TestClientCreate_AskAnAdmin(t *testing.T) {
 
 func TestClientCreate_RequiresLogin(t *testing.T) {
 	t.Setenv("TRACEBLOC_CONFIG_DIR", t.TempDir()) // no config → not signed in
-	err := runClientCreate(context.Background(), ui.New(&bytes.Buffer{}), nil, "x", "DE", true)
+	err := runClientCreate(context.Background(), ui.New(&bytes.Buffer{}), nil, clientCreateOpts{name: "x", location: "DE", yes: true})
 	if err == nil || !strings.Contains(err.Error(), "login") {
 		t.Errorf("want not-signed-in error, got %v", err)
 	}
@@ -149,7 +169,7 @@ func TestClientCreate_Interactive(t *testing.T) {
 		confirm: &confirmYes,
 	}
 	var out bytes.Buffer
-	if err := runClientCreate(context.Background(), ui.New(&out), pr, "", "", false); err != nil {
+	if err := runClientCreate(context.Background(), ui.New(&out), pr, clientCreateOpts{}); err != nil {
 		t.Fatalf("interactive create: %v", err)
 	}
 	if !posted {
@@ -180,7 +200,7 @@ func TestClientCreate_InteractiveCancel(t *testing.T) {
 		confirm: &confirmNo,
 	}
 	var out bytes.Buffer
-	if err := runClientCreate(context.Background(), ui.New(&out), pr, "", "", false); err != nil {
+	if err := runClientCreate(context.Background(), ui.New(&out), pr, clientCreateOpts{}); err != nil {
 		t.Fatalf("declining the confirm should be a clean exit, got: %v", err)
 	}
 	if posted {
@@ -224,10 +244,129 @@ func TestClientCreate_CollisionSuffix(t *testing.T) {
 			_, _ = w.Write([]byte(`{"id":2,"first_name":"My Client","username":"u-2","namespace":"my-client-2","location":"DE"}`))
 		}
 	})
-	if err := runClientCreate(context.Background(), ui.New(&bytes.Buffer{}), nil, "My Client", "DE", true); err != nil {
+	if err := runClientCreate(context.Background(), ui.New(&bytes.Buffer{}), nil, clientCreateOpts{name: "My Client", location: "DE", yes: true}); err != nil {
 		t.Fatal(err)
 	}
 	if body.Namespace != "my-client-2" {
 		t.Errorf("namespace = %q, want my-client-2 (collision suffix not applied)", body.Namespace)
 	}
 }
+
+func TestClientCreate_AnchorMint(t *testing.T) {
+	var body api.CreateClientRequest
+	withClientBackend(t, func(w http.ResponseWriter, r *http.Request) {
+		switch {
+		case r.Method == http.MethodGet:
+			_, _ = w.Write([]byte(`[]`))
+		case r.Method == http.MethodPost:
+			_ = json.NewDecoder(r.Body).Decode(&body)
+			w.WriteHeader(http.StatusCreated) // 201 = minted
+			_, _ = w.Write([]byte(`{"id":5,"first_name":"c","username":"u-5","namespace":"c","location":"DE","cluster_id":"uid-1"}`))
+		}
+	})
+	stubClusterID(t, "uid-1", nil)
+	var out bytes.Buffer
+	if err := runClientCreate(context.Background(), ui.New(&out), nil, clientCreateOpts{name: "c", location: "DE", yes: true}); err != nil {
+		t.Fatalf("create: %v", err)
+	}
+	if body.ClusterID != "uid-1" {
+		t.Errorf("cluster_id sent = %q, want uid-1 (anchor not wired into the request)", body.ClusterID)
+	}
+	if !strings.Contains(out.String(), "Machine credential") {
+		t.Errorf("mint should print the credential, got:\n%s", out.String())
+	}
+}
+
+func TestClientCreate_AdoptIdempotent(t *testing.T) {
+	posts := 0
+	var lastBody api.CreateClientRequest
+	withClientBackend(t, func(w http.ResponseWriter, r *http.Request) {
+		switch {
+		case r.Method == http.MethodGet:
+			_, _ = w.Write([]byte(`[]`))
+		case r.Method == http.MethodPost:
+			posts++
+			_ = json.NewDecoder(r.Body).Decode(&lastBody)
+			w.WriteHeader(http.StatusOK) // 200 = adopted an existing client
+			_, _ = w.Write([]byte(`{"id":8,"first_name":"existing","username":"u-8","namespace":"existing","location":"DE","cluster_id":"uid-1"}`))
+		}
+	})
+	stubClusterID(t, "uid-1", nil)
+
+	run := func() string {
+		var out bytes.Buffer
+		if err := runClientCreate(context.Background(), ui.New(&out), nil, clientCreateOpts{name: "c", location: "DE", yes: true}); err != nil {
+			t.Fatalf("adopt: %v", err)
+		}
+		return out.String()
+	}
+	// Two runs against the same cluster — real re-run idempotency: each POSTs once,
+	// both adopt the SAME existing client, neither prints a credential.
+	first, second := run(), run()
+	if posts != 2 {
+		t.Fatalf("posts = %d, want 2 (each run POSTs once)", posts)
+	}
+	for i, out := range []string{first, second} {
+		if !strings.Contains(out, "adopted") {
+			t.Errorf("run %d: adopt should say so, got:\n%s", i, out)
+		}
+		if strings.Contains(out, "Machine credential") {
+			t.Errorf("run %d: adopt must NOT print a credential, got:\n%s", i, out)
+		}
+	}
+	// The credential is still SENT on every create (the backend uses it only on a
+	// mint, §7.2) even though it's never printed on adopt.
+	if lastBody.Password == "" {
+		t.Error("password should still be sent in the adopt POST body")
+	}
+	cfg, _ := config.Load()
+	if cfg.ActiveClientID != "8" {
+		t.Errorf("active client = %q, want 8 (adopted id)", cfg.ActiveClientID)
+	}
+}
+
+func TestClientCreate_ClusterConflict(t *testing.T) {
+	withClientBackend(t, func(w http.ResponseWriter, r *http.Request) {
+		switch {
+		case r.Method == http.MethodGet:
+			_, _ = w.Write([]byte(`[]`))
+		case r.Method == http.MethodPost:
+			w.WriteHeader(http.StatusConflict) // 409 = bound to another account (R6)
+			_, _ = w.Write([]byte(`{"error":"cluster_conflict","cluster_id":"uid-1"}`))
+		}
+	})
+	stubClusterID(t, "uid-1", nil)
+	err := runClientCreate(context.Background(), ui.New(&bytes.Buffer{}), nil, clientCreateOpts{name: "c", location: "DE", yes: true})
+	if err == nil || !strings.Contains(err.Error(), "different tracebloc account") {
+		t.Errorf("want a cluster_conflict error, got %v", err)
+	}
+}
+
+func TestClientCreate_NoClusterAnchorWarns(t *testing.T) {
+	var body api.CreateClientRequest
+	withClientBackend(t, func(w http.ResponseWriter, r *http.Request) {
+		switch {
+		case r.Method == http.MethodGet:
+			_, _ = w.Write([]byte(`[]`))
+		case r.Method == http.MethodPost:
+			_ = json.NewDecoder(r.Body).Decode(&body)
+			w.WriteHeader(http.StatusCreated)
+			_, _ = w.Write([]byte(`{"id":3,"first_name":"c","username":"u-3","namespace":"c","location":"DE"}`))
+		}
+	})
+	// readClusterID left at the withClientBackend default (returns an error).
+	var out bytes.Buffer
+	if err := runClientCreate(context.Background(), ui.New(&out), nil, clientCreateOpts{name: "c", location: "DE", yes: true}); err != nil {
+		t.Fatalf("create: %v", err)
+	}
+	if body.ClusterID != "" {
+		t.Errorf("cluster_id sent = %q, want empty (no anchor when cluster unreadable)", body.ClusterID)
+	}
+	if !strings.Contains(out.String(), "without a cluster anchor") {
+		t.Errorf("expected a never-silent hint about the missing anchor, got:\n%s", out.String())
+	}
+	// The no-anchor path must still complete a full mint — the credential is shown.
+	if !strings.Contains(out.String(), "Machine credential") {
+		t.Errorf("no-anchor fallback should still print the credential, got:\n%s", out.String())
+	}
+}
diff --git a/internal/cluster/identity.go b/internal/cluster/identity.go
new file mode 100644
index 0000000..f33f14c
--- /dev/null
+++ b/internal/cluster/identity.go
@@ -0,0 +1,50 @@
+package cluster
+
+import (
+	"context"
+	"fmt"
+	"time"
+
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/client-go/kubernetes"
+)
+
+// clusterIDReadTimeout bounds the best-effort anchor read in ClusterID: a
+// kubeconfig pointing at an unreachable API server would otherwise hang the
+// kube-system GET for the OS TCP timeout, stalling a `client create` that is
+// meant to degrade to a non-anchored mint instead.
+const clusterIDReadTimeout = 8 * time.Second
+
+// ClusterID reads the kube-system namespace UID — the stable per-cluster
+// fingerprint RFC-0001 keys client idempotency on (§6.3 / §7.2; backend#883).
+// It needs a reachable cluster and RBAC to GET namespaces/kube-system. Callers
+// treat a failure as "couldn't read the cluster identity" and fall back to a
+// non-anchored (dual-mode) provision — they must not block on it.
+func ClusterID(ctx context.Context, opts KubeconfigOptions) (string, error) {
+	rc, err := Load(opts)
+	if err != nil {
+		return "", err
+	}
+	// Best-effort read — callers must not block on it (see the doc above), so cap
+	// it; otherwise an unreachable API server hangs the GET below.
+	rc.RestConfig.Timeout = clusterIDReadTimeout
+	cs, err := NewClientset(rc)
+	if err != nil {
+		return "", err
+	}
+	return clusterIDFrom(ctx, cs)
+}
+
+// clusterIDFrom reads the kube-system UID from a clientset. Split out so it can be
+// exercised with a fake clientset without a real cluster.
+func clusterIDFrom(ctx context.Context, cs kubernetes.Interface) (string, error) {
+	ns, err := cs.CoreV1().Namespaces().Get(ctx, "kube-system", metav1.GetOptions{})
+	if err != nil {
+		return "", fmt.Errorf("reading kube-system namespace UID: %w", err)
+	}
+	uid := string(ns.UID)
+	if uid == "" {
+		return "", fmt.Errorf("kube-system namespace has no UID")
+	}
+	return uid, nil
+}
diff --git a/internal/cluster/identity_test.go b/internal/cluster/identity_test.go
new file mode 100644
index 0000000..63c6d46
--- /dev/null
+++ b/internal/cluster/identity_test.go
@@ -0,0 +1,30 @@
+package cluster
+
+import (
+	"context"
+	"testing"
+
+	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/client-go/kubernetes/fake"
+)
+
+func TestClusterIDFrom(t *testing.T) {
+	cs := fake.NewClientset(&corev1.Namespace{
+		ObjectMeta: metav1.ObjectMeta{Name: "kube-system", UID: "uid-abc-123"},
+	})
+	got, err := clusterIDFrom(context.Background(), cs)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if got != "uid-abc-123" {
+		t.Errorf("cluster id = %q, want uid-abc-123", got)
+	}
+}
+
+func TestClusterIDFrom_NoNamespace(t *testing.T) {
+	cs := fake.NewClientset() // empty cluster — no kube-system
+	if _, err := clusterIDFrom(context.Background(), cs); err == nil {
+		t.Error("expected an error when kube-system is absent")
+	}
+}

From 722d2cf313fc09f79dd0f97aa65bb454a3b5dcac Mon Sep 17 00:00:00 2001
From: lukasWuttke <54042461+LukasWodka@users.noreply.github.com>
Date: Wed, 24 Jun 2026 16:01:46 +0200
Subject: [PATCH 12/15] =?UTF-8?q?feat(cli):=20client=20create=20--credenti?=
 =?UTF-8?q?al-file=20=E2=80=94=20write=20the=20credential=20for=20the=20in?=
 =?UTF-8?q?staller=20(#84)=20(#104)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* feat(cli): client create --credential-file — write the machine credential for the installer (#84)

Adds `tracebloc client create --credential-file PATH`: instead of printing the
minted credential, write it to PATH (mode 0600) as a sourceable env file the
installer reorder (#838) consumes — the secret never hits the terminal (RFC §9
"secure by invisibility" / never-show, deferred here from cli#102).

- Mint (201): writes TRACEBLOC_CLIENT_ID + TRACEBLOC_CLIENT_PASSWORD + TB_NAMESPACE
  (0600) and suppresses the stdout credential print. Write failure is fatal (the
  credential is the only copy).
- Adopt (200): writes TRACEBLOC_CLIENT_ID + TB_NAMESPACE + TRACEBLOC_CLIENT_ADOPTED=1
  (no password — the existing one stands, write-only on the backend); the installer
  reconciles the existing release rather than expecting a fresh credential.
- Without the flag: behaviour unchanged (the interim credential print).

Unblocks the #838 installer reorder (login -> create -> feed the chart). Tests cover
mint (0600 + sourceable + never-printed) and adopt (id+ns+marker, no password).
go build/vet/test ./... green.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

* fix(cli): force 0600 on credential file via temp+rename (#84)

writeClientCredential used os.WriteFile(path, ..., 0o600), but WriteFile
only applies its perm bits when it *creates* the file — over a pre-existing
target it truncates and writes WITHOUT changing the mode. So a stale file, or
one an attacker pre-creates world-readable, at --credential-file would receive
the minted password (the only copy) at its old, possibly 0644 mode — silently
breaking the flag's own 0600/never-show contract (RFC §9). Verified: a 0644
target stays 0644 after the write.

Write to a 0600 temp file in the target dir and atomically rename over the
path instead. CreateTemp is 0600 by construction, so the guarantee holds
unconditionally; rename is atomic (no half-written credential) and the final
write never follows a symlink planted at the target.

Tests:
- preexisting-perms: a 0644 target ends up 0600 (locks in the fix).
- write-fail-fatal: an unwritable target surfaces an error, never a silent
  drop (the credential is the only copy).
- mint never-show: also assert the password VALUE is absent from stdout, not
  just the literal "password"/"Machine credential" strings.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Co-authored-by: Asad Iqbal <asad.dsoft@gmail.com>
---
 internal/cli/client.go      |  99 ++++++++++++++++++++---
 internal/cli/client_test.go | 151 ++++++++++++++++++++++++++++++++++++
 2 files changed, 240 insertions(+), 10 deletions(-)

diff --git a/internal/cli/client.go b/internal/cli/client.go
index 481ddf7..c4d06a3 100644
--- a/internal/cli/client.go
+++ b/internal/cli/client.go
@@ -7,6 +7,8 @@ import (
 	"errors"
 	"fmt"
 	"net/http"
+	"os"
+	"path/filepath"
 	"strconv"
 	"strings"
 
@@ -39,7 +41,7 @@ in your account.  Requires sign-in first (` + "`tracebloc login`" + `).`,
 }
 
 func newClientCreateCmd() *cobra.Command {
-	var name, location, kubeconfigPath, contextOverride string
+	var name, location, kubeconfigPath, contextOverride, credentialFile string
 	var yes bool
 	cmd := &cobra.Command{
 		Use:   "create",
@@ -47,7 +49,7 @@ func newClientCreateCmd() *cobra.Command {
 		Args:  cobra.NoArgs,
 		RunE: func(cmd *cobra.Command, _ []string) error {
 			return runClientCreate(cmd.Context(), printerFor(cmd), clientPrompter(),
-				clientCreateOpts{name: name, location: location, kubeconfigPath: kubeconfigPath, contextOverride: contextOverride, yes: yes})
+				clientCreateOpts{name: name, location: location, kubeconfigPath: kubeconfigPath, contextOverride: contextOverride, credentialFile: credentialFile, yes: yes})
 		},
 	}
 	cmd.Flags().StringVar(&name, "name", "",
@@ -58,14 +60,16 @@ func newClientCreateCmd() *cobra.Command {
 		"path to the kubeconfig for the target cluster (default: $KUBECONFIG, then ~/.kube/config) — read to anchor the client to this cluster")
 	cmd.Flags().StringVar(&contextOverride, "context", "",
 		"kubeconfig context for the target cluster (default: current-context)")
+	cmd.Flags().StringVar(&credentialFile, "credential-file", "",
+		"write the machine credential to this path (mode 0600, sourceable env) instead of printing it — for the installer to feed the chart (never shown on the terminal)")
 	cmd.Flags().BoolVar(&yes, "yes", false, "skip the confirmation prompt")
 	return cmd
 }
 
 // clientCreateOpts bundles the `client create` inputs (flags + resolved prompts).
 type clientCreateOpts struct {
-	name, location, kubeconfigPath, contextOverride string
-	yes                                             bool
+	name, location, kubeconfigPath, contextOverride, credentialFile string
+	yes                                                             bool
 }
 
 func newClientListCmd() *cobra.Command {
@@ -225,6 +229,20 @@ func runClientCreate(ctx context.Context, p *ui.Printer, pr prompter, opts clien
 		// installer's orchestration — #838 — not done here.)
 		p.Successf("This cluster is already registered as client %q (namespace %s) — adopted it.", pc.Name, pc.Namespace)
 		p.Hintf("No new credential issued; the existing one stands. This machine is set to enroll as client %d.", pc.ID)
+		if opts.credentialFile != "" {
+			// No password to hand over on adopt (it's write-only on the backend and
+			// the existing one stands). Emit id + namespace + an ADOPTED marker so the
+			// installer reconciles the existing release rather than expecting a fresh
+			// credential (#838).
+			if werr := writeClientCredential(opts.credentialFile, []string{
+				"TRACEBLOC_CLIENT_ID=" + strconv.Itoa(pc.ID),
+				"TB_NAMESPACE=" + pc.Namespace,
+				"TRACEBLOC_CLIENT_ADOPTED=1",
+			}); werr != nil {
+				return &exitError{code: 1, err: werr}
+			}
+			p.Hintf("Wrote client id + namespace to %s (no new credential — the existing one stands).", opts.credentialFile)
+		}
 		// Mirror the mint path: a config-save failure shouldn't bury the result —
 		// hint how to set the pointer by hand and still exit clean.
 		if serr := cfg.Save(); serr != nil {
@@ -232,19 +250,80 @@ func runClientCreate(ctx context.Context, p *ui.Printer, pr prompter, opts clien
 		}
 		return nil
 	}
-	// Mint: print the credential FIRST — it's the only copy (the backend stores
-	// only the hash), so a later config-save failure must never cost it.
+	// Mint. With --credential-file the secret goes to a 0600 file (never the
+	// terminal — RFC §9 "secure by invisibility") for the installer to source;
+	// otherwise it's printed (the interim, until the installer drives this).
 	p.Successf("Provisioned client %q (namespace %s).", pc.Name, pc.Namespace)
-	p.Section("Machine credential — needed by the installer to connect this client")
-	p.Field("client id", strconv.Itoa(pc.ID))
-	p.Field("username", pc.Username)
-	p.Field("password", password)
+	if opts.credentialFile != "" {
+		if werr := writeClientCredential(opts.credentialFile, []string{
+			"TRACEBLOC_CLIENT_ID=" + strconv.Itoa(pc.ID),
+			"TRACEBLOC_CLIENT_PASSWORD=" + password,
+			"TB_NAMESPACE=" + pc.Namespace,
+		}); werr != nil {
+			// The credential is the only copy — a write failure must be fatal, not a
+			// silent drop (the installer would have nothing to connect with).
+			return &exitError{code: 1, err: werr}
+		}
+		p.Hintf("Credential written to %s (mode 0600, not shown). This machine is set to enroll as client %d.", opts.credentialFile, pc.ID)
+	} else {
+		// Print the credential FIRST — it's the only copy (the backend stores only
+		// the hash), so a later config-save failure must never cost it.
+		p.Section("Machine credential — needed by the installer to connect this client")
+		p.Field("client id", strconv.Itoa(pc.ID))
+		p.Field("username", pc.Username)
+		p.Field("password", password)
+	}
 	if serr := cfg.Save(); serr != nil {
 		p.Hintf("Couldn't save the active-client pointer (%v) — run `tracebloc client use %d` to set it.", serr, pc.ID)
 	}
 	return nil
 }
 
+// writeClientCredential writes the machine credential to path (mode 0600) as a
+// shell-sourceable env file — the installer (#838) sources it to feed the chart,
+// so the secret lands in a 0600 file, never the terminal (RFC §9 never-show). The
+// values are constrained charsets (numeric id, hex password, DNS-1123 slug), so
+// no shell-escaping is needed.
+//
+// Written via a 0600 temp file + atomic rename rather than os.WriteFile: WriteFile
+// only applies its perm bits when it *creates* the file, so a pre-existing target
+// (a stale file, or one an attacker pre-creates world-readable) would keep its old
+// mode and leak the secret — the 0600 guarantee must hold unconditionally. The temp
+// also avoids following a symlink at the target and never leaves a half-written
+// credential behind.
+func writeClientCredential(path string, lines []string) error {
+	dir := filepath.Dir(path)
+	if dir != "" && dir != "." {
+		if err := os.MkdirAll(dir, 0o700); err != nil {
+			return fmt.Errorf("creating credential-file directory: %w", err)
+		}
+	}
+	body := "# tracebloc client credential — written by `tracebloc client create`.\n" +
+		"# Mode 0600; sourced by the installer. Do not commit or share.\n" +
+		strings.Join(lines, "\n") + "\n"
+	// CreateTemp makes the file 0600 by construction, in the target dir so the
+	// rename stays on one filesystem.
+	f, err := os.CreateTemp(dir, ".cred-*")
+	if err != nil {
+		return fmt.Errorf("writing credential file %s: %w", path, err)
+	}
+	tmp := f.Name()
+	if _, err := f.WriteString(body); err != nil {
+		_ = f.Close()
+		_ = os.Remove(tmp)
+		return fmt.Errorf("writing credential file %s: %w", path, err)
+	}
+	if err := f.Close(); err != nil {
+		_ = os.Remove(tmp)
+		return fmt.Errorf("writing credential file %s: %w", path, err)
+	}
+	if err := os.Rename(tmp, path); err != nil {
+		_ = os.Remove(tmp)
+		return fmt.Errorf("writing credential file %s: %w", path, err)
+	}
+	return nil
+}
+
 // askAnAdmin renders the "you can't provision — here's who can" path (a 403 from
 // the backend means no CLIENT_WRITE; backend#836 Q4).
 func askAnAdmin(ctx context.Context, p *ui.Printer, client *api.Client) error {
diff --git a/internal/cli/client_test.go b/internal/cli/client_test.go
index 5d77cb7..bd79021 100644
--- a/internal/cli/client_test.go
+++ b/internal/cli/client_test.go
@@ -8,6 +8,8 @@ import (
 	"fmt"
 	"net/http"
 	"net/http/httptest"
+	"os"
+	"path/filepath"
 	"strings"
 	"testing"
 
@@ -325,6 +327,155 @@ func TestClientCreate_AdoptIdempotent(t *testing.T) {
 	}
 }
 
+func TestClientCreate_CredentialFileMint(t *testing.T) {
+	withClientBackend(t, func(w http.ResponseWriter, r *http.Request) {
+		switch {
+		case r.Method == http.MethodGet:
+			_, _ = w.Write([]byte(`[]`))
+		case r.Method == http.MethodPost:
+			w.WriteHeader(http.StatusCreated)
+			_, _ = w.Write([]byte(`{"id":5,"first_name":"c","username":"u-5","namespace":"my-ns","location":"DE","cluster_id":"uid-1"}`))
+		}
+	})
+	stubClusterID(t, "uid-1", nil)
+	credPath := filepath.Join(t.TempDir(), "cred.env")
+	var out bytes.Buffer
+	if err := runClientCreate(context.Background(), ui.New(&out), nil,
+		clientCreateOpts{name: "c", location: "DE", yes: true, credentialFile: credPath}); err != nil {
+		t.Fatalf("create: %v", err)
+	}
+	// never-show: the secret must NOT hit the terminal.
+	if strings.Contains(out.String(), "Machine credential") || strings.Contains(out.String(), "password") {
+		t.Errorf("credential must not be printed when --credential-file is set, got:\n%s", out.String())
+	}
+	// the file is 0600 and carries the sourceable credential.
+	info, err := os.Stat(credPath)
+	if err != nil {
+		t.Fatalf("credential file not written: %v", err)
+	}
+	if perm := info.Mode().Perm(); perm != 0o600 {
+		t.Errorf("credential file mode = %o, want 600", perm)
+	}
+	kv := parseEnvFile(t, credPath)
+	if kv["TRACEBLOC_CLIENT_ID"] != "5" || kv["TB_NAMESPACE"] != "my-ns" || kv["TRACEBLOC_CLIENT_PASSWORD"] == "" {
+		t.Errorf("credential file = %v (want id=5, ns=my-ns, non-empty password)", kv)
+	}
+	// never-show, the real invariant: the minted password VALUE must not appear
+	// in stdout under any label (the string checks above are just a proxy).
+	if strings.Contains(out.String(), kv["TRACEBLOC_CLIENT_PASSWORD"]) {
+		t.Errorf("minted password leaked to the terminal:\n%s", out.String())
+	}
+}
+
+// TestClientCreate_CredentialFilePreexistingPerms locks in the 0600 guarantee
+// when the target already exists with looser perms — os.WriteFile would have
+// kept the stale mode and leaked the secret group/other-readable.
+func TestClientCreate_CredentialFilePreexistingPerms(t *testing.T) {
+	withClientBackend(t, func(w http.ResponseWriter, r *http.Request) {
+		switch {
+		case r.Method == http.MethodGet:
+			_, _ = w.Write([]byte(`[]`))
+		case r.Method == http.MethodPost:
+			w.WriteHeader(http.StatusCreated)
+			_, _ = w.Write([]byte(`{"id":5,"first_name":"c","username":"u-5","namespace":"my-ns","location":"DE","cluster_id":"uid-1"}`))
+		}
+	})
+	stubClusterID(t, "uid-1", nil)
+	credPath := filepath.Join(t.TempDir(), "cred.env")
+	// A stale, world-readable file already sits at the target path.
+	if err := os.WriteFile(credPath, []byte("stale\n"), 0o644); err != nil {
+		t.Fatal(err)
+	}
+	var out bytes.Buffer
+	if err := runClientCreate(context.Background(), ui.New(&out), nil,
+		clientCreateOpts{name: "c", location: "DE", yes: true, credentialFile: credPath}); err != nil {
+		t.Fatalf("create: %v", err)
+	}
+	info, err := os.Stat(credPath)
+	if err != nil {
+		t.Fatalf("credential file not written: %v", err)
+	}
+	if perm := info.Mode().Perm(); perm != 0o600 {
+		t.Errorf("credential file mode = %o over a pre-existing 0644 target, want 600", perm)
+	}
+}
+
+// TestClientCreate_CredentialFileWriteFailFatal asserts a credential-file write
+// failure is fatal — the minted password is the only copy, so a failed write must
+// surface an error, never a silent drop. The target's parent is a regular file, so
+// the directory create (hence the write) fails deterministically.
+func TestClientCreate_CredentialFileWriteFailFatal(t *testing.T) {
+	withClientBackend(t, func(w http.ResponseWriter, r *http.Request) {
+		switch {
+		case r.Method == http.MethodGet:
+			_, _ = w.Write([]byte(`[]`))
+		case r.Method == http.MethodPost:
+			w.WriteHeader(http.StatusCreated)
+			_, _ = w.Write([]byte(`{"id":5,"first_name":"c","username":"u-5","namespace":"my-ns","location":"DE","cluster_id":"uid-1"}`))
+		}
+	})
+	stubClusterID(t, "uid-1", nil)
+	notADir := filepath.Join(t.TempDir(), "iam-a-file")
+	if err := os.WriteFile(notADir, []byte("x"), 0o600); err != nil {
+		t.Fatal(err)
+	}
+	credPath := filepath.Join(notADir, "cred.env") // parent is a file → write fails
+	var out bytes.Buffer
+	err := runClientCreate(context.Background(), ui.New(&out), nil,
+		clientCreateOpts{name: "c", location: "DE", yes: true, credentialFile: credPath})
+	if err == nil {
+		t.Fatal("expected a fatal error when the credential file can't be written, got nil")
+	}
+}
+
+func TestClientCreate_CredentialFileAdopt(t *testing.T) {
+	withClientBackend(t, func(w http.ResponseWriter, r *http.Request) {
+		switch {
+		case r.Method == http.MethodGet:
+			_, _ = w.Write([]byte(`[]`))
+		case r.Method == http.MethodPost:
+			w.WriteHeader(http.StatusOK) // adopt
+			_, _ = w.Write([]byte(`{"id":8,"first_name":"existing","username":"u-8","namespace":"ex-ns","location":"DE","cluster_id":"uid-1"}`))
+		}
+	})
+	stubClusterID(t, "uid-1", nil)
+	credPath := filepath.Join(t.TempDir(), "cred.env")
+	var out bytes.Buffer
+	if err := runClientCreate(context.Background(), ui.New(&out), nil,
+		clientCreateOpts{name: "c", location: "DE", yes: true, credentialFile: credPath}); err != nil {
+		t.Fatalf("adopt: %v", err)
+	}
+	kv := parseEnvFile(t, credPath)
+	// adopt emits id + namespace + the ADOPTED marker, but NO password (the
+	// existing one stands; it's write-only on the backend).
+	if kv["TRACEBLOC_CLIENT_ID"] != "8" || kv["TB_NAMESPACE"] != "ex-ns" || kv["TRACEBLOC_CLIENT_ADOPTED"] != "1" {
+		t.Errorf("adopt credential file = %v (want id=8, ns=ex-ns, ADOPTED=1)", kv)
+	}
+	if _, hasPw := kv["TRACEBLOC_CLIENT_PASSWORD"]; hasPw {
+		t.Errorf("adopt must not write a password (none issued), got:\n%v", kv)
+	}
+}
+
+// parseEnvFile reads a KEY=value env file (skipping # comments) into a map.
+func parseEnvFile(t *testing.T, path string) map[string]string {
+	t.Helper()
+	raw, err := os.ReadFile(path)
+	if err != nil {
+		t.Fatalf("read credential file: %v", err)
+	}
+	kv := map[string]string{}
+	for _, line := range strings.Split(string(raw), "\n") {
+		line = strings.TrimSpace(line)
+		if line == "" || strings.HasPrefix(line, "#") {
+			continue
+		}
+		if k, v, ok := strings.Cut(line, "="); ok {
+			kv[k] = v
+		}
+	}
+	return kv
+}
+
 func TestClientCreate_ClusterConflict(t *testing.T) {
 	withClientBackend(t, func(w http.ResponseWriter, r *http.Request) {
 		switch {

From f7bc32b13f77b7f2856b93eae9451c4f82b48233 Mon Sep 17 00:00:00 2001
From: "Asad Iqbal (Saadi)" <asad.dsoft@gmail.com>
Date: Wed, 24 Jun 2026 19:40:55 +0500
Subject: [PATCH 13/15] fix(cli/api): address Bugbot findings on the v0.4.0 RC
 (#106) (#108)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* fix(api): error on unparseable pagination next, don't silently truncate (#106)

ListClients followed DRF `next` via nextPath, which returned "" for BOTH an
empty link (end of pages) and an unparseable one — so a non-empty `next` the
server sends that url.Parse rejects silently ended the loop, and ListClients
returned only the pages seen so far with a nil error. list / `use` /
namespace-collision checks would then miss clients with no signal.

nextPath now returns (string, error): "" + nil for an empty link, an error for
a non-empty link that won't parse. Trigger is unlikely (DRF emits well-formed
URLs, url.Parse is lenient), but the failure mode — silent partial list — is
the wrong one for a correctness-sensitive call.

Tests: pagination still followed end-to-end (page 1 → 2 → done); an
unparseable next link is now a hard error, not a truncation.

Bugbot: 8dadb5c2-804a-48ed-bc81-eb14e6317be1 (v0.4.0 RC, #107)

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>

* fix(cli): route all known-but-unsupported categories to the pending note (#106)

The dataset-push category gate only special-cased known-but-unsupported *image*
categories (`case push.IsImage`), so a registry-known non-image category that
isn't CLI-supported yet — `causal_language_modeling` (FamilyText,
CLISupported:false, with a real UnsupportedNote) — fell to the default branch
and was reported as "isn't a recognized task category". It IS recognized; it's
pending support.

Swap the gate to `case push.IsKnown`: supported categories are already caught
by the prior case, so IsKnown here means known-but-unsupported (image or text),
all routed through the registry's per-category pending-support note. The default
branch is left for genuinely unknown/typo'd categories. Message-only (exit code
was already 2).

Test: causal_language_modeling now gets the pending-support note, not the
unrecognized-category message. (The existing exit-2 test didn't assert the
message, which is how this slipped through.)

Bugbot: 16f5b945-5d67-4201-8bc6-1f6baf633672 (v0.4.0 RC, #107)

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
---
 internal/api/client.go       | 22 ++++++++++++--------
 internal/api/client_test.go  | 40 ++++++++++++++++++++++++++++++++++++
 internal/cli/dataset.go      | 11 ++++++----
 internal/cli/dataset_test.go | 31 ++++++++++++++++++++++++++++
 4 files changed, 92 insertions(+), 12 deletions(-)

diff --git a/internal/api/client.go b/internal/api/client.go
index fbd97bb..5ff51cc 100644
--- a/internal/api/client.go
+++ b/internal/api/client.go
@@ -346,26 +346,32 @@ func (c *Client) ListClients(ctx context.Context) ([]ProvisionedClient, error) {
 			return nil, fmt.Errorf("decoding client list: %w", err)
 		}
 		all = append(all, body.Results...)
-		path = nextPath(body.Next)
+		next, perr := nextPath(body.Next)
+		if perr != nil {
+			return nil, perr
+		}
+		path = next
 	}
 	return all, nil
 }
 
 // nextPath reduces a DRF `next` link (an absolute URL) to the path+query this
-// client appends to BaseURL. Returns "" for an empty/unparseable link, which
-// ends the pagination loop.
-func nextPath(next string) string {
+// client appends to BaseURL. An empty link returns ("", nil) — the normal
+// end of pages. A non-empty link that won't parse is an error, NOT a silent
+// "", so the loop never quietly stops mid-list and returns only the pages
+// seen so far (list / `use` / collision checks must see every client).
+func nextPath(next string) (string, error) {
 	if next == "" {
-		return ""
+		return "", nil
 	}
 	u, err := url.Parse(next)
 	if err != nil {
-		return ""
+		return "", fmt.Errorf("client list: unparseable pagination link %q: %w", next, err)
 	}
 	if u.RawQuery != "" {
-		return u.Path + "?" + u.RawQuery
+		return u.Path + "?" + u.RawQuery, nil
 	}
-	return u.Path
+	return u.Path, nil
 }
 
 // ListClientAdmins returns who in the account can provision (the ask-an-admin
diff --git a/internal/api/client_test.go b/internal/api/client_test.go
index f5fbdcd..8cad599 100644
--- a/internal/api/client_test.go
+++ b/internal/api/client_test.go
@@ -194,3 +194,43 @@ func TestCreateClientConflict(t *testing.T) {
 		t.Errorf("want APIError 409, got %v", err)
 	}
 }
+
+// TestListClients_FollowsPagination guards that DRF pagination is still
+// followed end-to-end after the nextPath refactor (page 1 → page 2 → done).
+func TestListClients_FollowsPagination(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		if r.URL.Query().Get("page") == "2" {
+			_, _ = w.Write([]byte(`{"next":"","results":[{"id":2,"first_name":"b","namespace":"b"}]}`))
+			return
+		}
+		// DRF emits an absolute `next`; nextPath keeps only path+query.
+		_, _ = w.Write([]byte(`{"next":"http://x/edge-device/?page=2","results":[{"id":1,"first_name":"a","namespace":"a"}]}`))
+	}))
+	defer srv.Close()
+	c := New("prod")
+	c.BaseURL = srv.URL
+	got, err := c.ListClients(context.Background())
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(got) != 2 || got[0].ID != 1 || got[1].ID != 2 {
+		t.Fatalf("want 2 clients [1,2], got %+v", got)
+	}
+}
+
+// TestListClients_UnparseableNextLink_IsError pins the Bugbot fix (v0.4.0 RC):
+// a non-empty `next` the server sends that url.Parse rejects must be a hard
+// error, never a silent truncation to the pages seen so far — otherwise list /
+// `use` / namespace-collision checks would miss clients without any error.
+func TestListClients_UnparseableNextLink_IsError(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+		// `next` carries a control byte (\u007f) → url.Parse fails.
+		_, _ = w.Write([]byte(`{"next":"http://x/\u007f","results":[{"id":1,"first_name":"a","namespace":"a"}]}`))
+	}))
+	defer srv.Close()
+	c := New("prod")
+	c.BaseURL = srv.URL
+	if _, err := c.ListClients(context.Background()); err == nil {
+		t.Fatal("expected an error on an unparseable next link, got nil (silent truncation)")
+	}
+}
diff --git a/internal/cli/dataset.go b/internal/cli/dataset.go
index 8cb40ba..012fd2c 100644
--- a/internal/cli/dataset.go
+++ b/internal/cli/dataset.go
@@ -423,10 +423,13 @@ contributors train against it without ever seeing the raw files.`))
 		// "category is required" error downstream.
 	case push.IsCLISupported(a.Spec.Category):
 		// supported
-	case push.IsImage(a.Spec.Category):
-		// A known image category dataset push doesn't implement yet
-		// (semantic_segmentation / instance_segmentation). The per-category
-		// reason + the supported list both come from the registry.
+	case push.IsKnown(a.Spec.Category):
+		// A recognized category dataset push doesn't implement yet — image
+		// (semantic_segmentation / instance_segmentation) or text
+		// (causal_language_modeling). Routed here (not the default branch) so the
+		// user gets the registry's per-category pending-support reason, not a
+		// misleading "unrecognized category". Supported categories were already
+		// caught above, so IsKnown here means known-but-unsupported.
 		spec, _ := push.Lookup(a.Spec.Category)
 		return &exitError{code: 2, err: fmt.Errorf(
 			"category %q isn't supported by the CLI yet (%s). Supported categories: %s.",
diff --git a/internal/cli/dataset_test.go b/internal/cli/dataset_test.go
index 5d5cfcf..b8415b5 100644
--- a/internal/cli/dataset_test.go
+++ b/internal/cli/dataset_test.go
@@ -93,6 +93,37 @@ func TestDatasetPush_UnsupportedCategory_ExitsTwo(t *testing.T) {
 	}
 }
 
+// TestDatasetPush_KnownUnsupportedCategory_PendingNote pins the Bugbot fix
+// (v0.4.0 RC): a registry-known but CLI-unsupported NON-image category
+// (causal_language_modeling) must get the registry's pending-support note, not
+// the misleading "isn't a recognized task category" message. execDatasetPush
+// discards the error and SilenceErrors swallows it, so run the command here and
+// inspect the returned error directly.
+func TestDatasetPush_KnownUnsupportedCategory_PendingNote(t *testing.T) {
+	root := imgcLayout(t)
+	rootCmd := NewRootCmd(BuildInfo{Version: "test"})
+	rootCmd.SetOut(&bytes.Buffer{})
+	rootCmd.SetErr(&bytes.Buffer{})
+	rootCmd.SetArgs([]string{"dataset", "push",
+		"--kubeconfig=/tmp/tracebloc-cli-test-nonexistent-" + t.Name(),
+		root, "--table=t1", "--category=causal_language_modeling",
+		"--intent=train", "--label-column=label"})
+	err := rootCmd.Execute()
+	if err == nil {
+		t.Fatal("expected an error for a known-but-unsupported category")
+	}
+	if got := ExitCodeFromError(err); got != 2 {
+		t.Fatalf("exit code = %d, want 2", got)
+	}
+	msg := err.Error()
+	if strings.Contains(msg, "isn't a recognized task category") {
+		t.Errorf("known category misrouted to the unrecognized-category branch:\n%s", msg)
+	}
+	if !strings.Contains(msg, "isn't supported by the CLI yet") {
+		t.Errorf("want the registry pending-support note, got:\n%s", msg)
+	}
+}
+
 // TestDatasetPush_TraversalTableName_ExitsTwo is the security
 // regression pin at the CLI layer. --table=../../etc must be
 // rejected with exit 2 BEFORE any spec synthesis or cluster work —

From 38817462bbe74b6e0130c68322c1c36a84479de6 Mon Sep 17 00:00:00 2001
From: "Asad Iqbal (Saadi)" <asad.dsoft@gmail.com>
Date: Wed, 24 Jun 2026 19:56:50 +0500
Subject: [PATCH 14/15] fix(cli/push): address round-2 Bugbot findings on the
 v0.4.0 RC (#106) (#109)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* fix(cli): clear active_client_id on logout (#106)

logout cleared the token and email but left active_client_id in
~/.tracebloc/config.json. Since that pointer is account-scoped, a later
`login` as a different user inherited the stale id — `auth status` and
`client list` would surface the previous account's active client until the
user ran `client create`/`client use` again.

Clear it alongside the token/email so logout fully drops local session state.

Bugbot: "Stale active client after logout" (Medium, v0.4.0 RC, #107)

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>

* fix(push): add token_classification to the registry + pin schema parity (#106)

The re-vendored ingest.v1.json (#103) accepts token_classification, but the
category registry didn't list it — so `dataset push --category=token_classification`
hit the "isn't a recognized task category" path despite being schema-valid (the
same misrouting just fixed for causal_language_modeling). Added it as a known,
not-yet-CLI-supported FamilyText category with a pending-support note (the safe
default — it was never pushable, so this only improves the message; flip to
supported when the texts/token-label staging lands).

Root-cause guard: the existing registry tests only pinned the registry against
a hand-written list, which stayed self-consistent while drifting from the
schema. Added TestRegistryCoversSchemaCategories — parses the embedded schema's
category enum and asserts every entry is registry-known — so any future
schema-only category is caught here, not in the next review pass. (The reverse,
a registry-only known-unsupported alias like instance_segmentation, is allowed:
it's gated out before schema validation.)

Bugbot: "Missing token_classification registry entry" (Medium, v0.4.0 RC, #107)

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
---
 internal/cli/auth.go                    |  5 ++++
 internal/cli/auth_test.go               |  7 ++++-
 internal/push/category.go               |  2 ++
 internal/push/category_registry_test.go | 39 +++++++++++++++++++++++--
 4 files changed, 50 insertions(+), 3 deletions(-)

diff --git a/internal/cli/auth.go b/internal/cli/auth.go
index 6f15965..9819d29 100644
--- a/internal/cli/auth.go
+++ b/internal/cli/auth.go
@@ -151,6 +151,11 @@ func newLogoutCmd() *cobra.Command {
 			}
 			cfg.Token = ""
 			cfg.Email = ""
+			// Also drop the active-client pointer: it's account-scoped, so leaving
+			// it would bleed into the next account's session (a later `login` as a
+			// different user would inherit a stale active client in `auth status` /
+			// `client list`).
+			cfg.ActiveClientID = ""
 			if err := cfg.Save(); err != nil {
 				return &exitError{code: 1, err: err}
 			}
diff --git a/internal/cli/auth_test.go b/internal/cli/auth_test.go
index 5be742b..d6b2460 100644
--- a/internal/cli/auth_test.go
+++ b/internal/cli/auth_test.go
@@ -119,7 +119,7 @@ func TestLogin_Denied(t *testing.T) {
 
 func TestLogout(t *testing.T) {
 	t.Setenv("TRACEBLOC_CONFIG_DIR", t.TempDir())
-	if err := (&config.Config{Token: "x", Email: "e@co"}).Save(); err != nil {
+	if err := (&config.Config{Token: "x", Email: "e@co", ActiveClientID: "7"}).Save(); err != nil {
 		t.Fatal(err)
 	}
 	out, err := runCmd(t, "logout")
@@ -130,6 +130,11 @@ func TestLogout(t *testing.T) {
 	if cfg.SignedIn() {
 		t.Error("expected to be signed out")
 	}
+	// The active-client pointer is account-scoped — it must not survive logout,
+	// or it bleeds into the next account's session.
+	if cfg.ActiveClientID != "" {
+		t.Errorf("active_client_id = %q after logout, want cleared", cfg.ActiveClientID)
+	}
 	if !strings.Contains(out, "Signed out") {
 		t.Errorf("got:\n%s", out)
 	}
diff --git a/internal/push/category.go b/internal/push/category.go
index b48640d..73f0ffd 100644
--- a/internal/push/category.go
+++ b/internal/push/category.go
@@ -70,6 +70,8 @@ var categoryRegistry = []CategorySpec{
 		UnsupportedNote: "not implemented"},
 	{ID: "causal_language_modeling", Family: FamilyText, Label: "Causal language modeling", CLISupported: false,
 		UnsupportedNote: "schema-recognized (data-ingestors#805); `tracebloc ingest` discover/build for its raw-.txt / prompt\\tcompletion `texts` layout is pending"},
+	{ID: "token_classification", Family: FamilyText, Label: "Token classification", CLISupported: false,
+		UnsupportedNote: "schema-recognized; the CLI doesn't stage its per-token-label `texts` layout yet"},
 }
 
 // categoryByID indexes the registry for O(1) lookup, built once.
diff --git a/internal/push/category_registry_test.go b/internal/push/category_registry_test.go
index 997fc2e..17ce1f3 100644
--- a/internal/push/category_registry_test.go
+++ b/internal/push/category_registry_test.go
@@ -1,8 +1,11 @@
 package push
 
 import (
+	"encoding/json"
 	"sort"
 	"testing"
+
+	"github.com/tracebloc/cli/internal/schema"
 )
 
 // The registry is the single source of truth; these pin its contents and
@@ -13,7 +16,8 @@ func TestRegistryKnownCategories(t *testing.T) {
 	want := []string{
 		"image_classification", "object_detection", "keypoint_detection",
 		"semantic_segmentation", "instance_segmentation",
-		"text_classification", "masked_language_modeling", "causal_language_modeling",
+		"text_classification", "token_classification",
+		"masked_language_modeling", "causal_language_modeling",
 		"tabular_classification", "tabular_regression",
 		"time_series_forecasting", "time_to_event_prediction",
 	}
@@ -42,7 +46,7 @@ func TestSupportedCategories(t *testing.T) {
 	}
 	// segmentation + causal_language_modeling are known but not yet pushable,
 	// and must explain why.
-	for _, id := range []string{"semantic_segmentation", "instance_segmentation", "causal_language_modeling"} {
+	for _, id := range []string{"semantic_segmentation", "instance_segmentation", "causal_language_modeling", "token_classification"} {
 		if !IsKnown(id) {
 			t.Errorf("%s should be known", id)
 		}
@@ -82,6 +86,37 @@ func TestPredicatesDeriveFromRegistry(t *testing.T) {
 	}
 }
 
+// TestRegistryCoversSchemaCategories pins registry⇄schema parity: every
+// category the ingest schema accepts must be known to the registry, or a
+// schema-valid `dataset push --category=X` is wrongly rejected as
+// "unrecognized" (the token_classification drift, Bugbot v0.4.0 RC). The
+// existing tests only pin the registry against a hand-written list, which
+// stays internally consistent while drifting from the schema — this closes
+// that gap. The reverse direction isn't required: the registry may carry a
+// known-but-unsupported alias the v1 schema doesn't list yet (e.g.
+// instance_segmentation), which is gated out before schema validation.
+func TestRegistryCoversSchemaCategories(t *testing.T) {
+	var doc struct {
+		Properties struct {
+			Category struct {
+				Enum []string `json:"enum"`
+			} `json:"category"`
+		} `json:"properties"`
+	}
+	if err := json.Unmarshal(schema.V1Bytes, &doc); err != nil {
+		t.Fatalf("parse embedded schema: %v", err)
+	}
+	if len(doc.Properties.Category.Enum) == 0 {
+		t.Fatal("no category enum found in the embedded schema (parse path wrong?)")
+	}
+	for _, id := range doc.Properties.Category.Enum {
+		if !IsKnown(id) {
+			t.Errorf("schema category %q missing from the registry — `dataset push --category=%s` "+
+				"would be rejected as unrecognized despite passing schema validation", id, id)
+		}
+	}
+}
+
 func equalSet(a, b []string) bool {
 	if len(a) != len(b) {
 		return false

From 818db1c313bfa67bda315ccc3c206a0fd0d1b7d2 Mon Sep 17 00:00:00 2001
From: "Asad Iqbal (Saadi)" <asad.dsoft@gmail.com>
Date: Wed, 24 Jun 2026 20:25:40 +0500
Subject: [PATCH 15/15] =?UTF-8?q?fix:=20proactive=20v0.4.0=20RC=20review?=
 =?UTF-8?q?=20=E2=80=94=20fix=204=20findings,=20document=203=20non-issues?=
 =?UTF-8?q?=20(#106)=20(#110)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* fix(cli): back off device-flow poll by 5s on slow_down (RFC 8628) (#106)

On `slow_down`, runLogin bumped the poll interval by 1s (`interval++`). RFC 8628
§3.5 requires increasing it by 5s for that and all subsequent polls, so the CLI
kept polling too aggressively after the server asked it to back off.

Test captures the durations handed to the pollAfter seam: post-slow_down wait is
now 10s (5+5), not 6s.

Bugbot: "Device flow slow_down interval" (Medium, v0.4.0 RC, #107)

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>

* fix(doctor): resolve CLIENT_ENV case-insensitively in backendHost (#106)

backendHost switched on CLIENT_ENV case-sensitively, but the API client
(api.ResolveEnv/BaseURL) lowercases env values. A non-lowercase CLIENT_ENV on
the edge box (e.g. "DEV") fell through to the prod default, so `cluster doctor`
probed api.tracebloc.io even when the cluster targeted dev/stg. Normalize with
ToLower+TrimSpace before the switch.

Test extends TestBackendHost with "DEV"/"Stg"/" dev " cases.

Bugbot: "Doctor backend env casing" (Low, v0.4.0 RC, #107)

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>

* fix(api): guard the bare-array list decode to the first page (#106)

ListClients attempted the unpaginated bare-array decode on every iteration. A
bare array is only valid as the sole response (a paginated chain is a
{next,results} object on every page), so a stray bare body mid-chain could
silently end the loop and drop earlier pages. Guard the bare decode to pageNum 0.

Test: a bare-array response still returns the full list.

Found in the proactive RC review (low; latent — DRF doesn't mix shapes).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>

* fix(cli): exclude this cluster's own client from create collision check (#106)

`client create` built the namespace-collision set from ALL clients, including the
one already anchored to this cluster. On an idempotent re-run with the same
--name, that bumped the derived slug (lab-one → lab-one-2) and showed it in the
review — but the backend adopts on cluster_id and returns the original namespace,
so the review contradicted the actual outcome. Skip the client whose cluster_id
matches this cluster's anchor.

Test: a re-run review no longer shows a bumped namespace.

Found in the proactive RC review (low; cosmetic — backend state was correct).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
---
 internal/api/client.go         | 12 ++++++---
 internal/api/client_test.go    | 19 +++++++++++++
 internal/cli/auth.go           |  4 ++-
 internal/cli/auth_test.go      | 49 ++++++++++++++++++++++++++++++++++
 internal/cli/client.go         |  7 +++++
 internal/cli/client_test.go    | 28 +++++++++++++++++++
 internal/doctor/doctor.go      |  5 +++-
 internal/doctor/doctor_test.go |  5 ++++
 8 files changed, 123 insertions(+), 6 deletions(-)

diff --git a/internal/api/client.go b/internal/api/client.go
index 5ff51cc..f0e8187 100644
--- a/internal/api/client.go
+++ b/internal/api/client.go
@@ -333,10 +333,14 @@ func (c *Client) ListClients(ctx context.Context) ([]ProvisionedClient, error) {
 		if status < 200 || status >= 300 {
 			return nil, &APIError{StatusCode: status, Body: string(raw), URL: reqURL}
 		}
-		// Unpaginated deployment → a bare array; return it as-is.
-		var bare []ProvisionedClient
-		if err := json.Unmarshal(raw, &bare); err == nil {
-			return append(all, bare...), nil
+		// Unpaginated deployment → a bare array. Only valid as the sole response
+		// (a paginated chain is a `{next,results}` object on every page), so guard
+		// to page 0 — a stray bare body mid-chain must not silently end the loop.
+		if pageNum == 0 {
+			var bare []ProvisionedClient
+			if err := json.Unmarshal(raw, &bare); err == nil {
+				return bare, nil
+			}
 		}
 		var body struct {
 			Next    string              `json:"next"`
diff --git a/internal/api/client_test.go b/internal/api/client_test.go
index 8cad599..e091b79 100644
--- a/internal/api/client_test.go
+++ b/internal/api/client_test.go
@@ -234,3 +234,22 @@ func TestListClients_UnparseableNextLink_IsError(t *testing.T) {
 		t.Fatal("expected an error on an unparseable next link, got nil (silent truncation)")
 	}
 }
+
+// TestListClients_BareArrayUnpaginated covers the unpaginated deployment shape
+// (a bare JSON array) — still returned as-is after the bare-decode was guarded
+// to the first page.
+func TestListClients_BareArrayUnpaginated(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+		_, _ = w.Write([]byte(`[{"id":1,"first_name":"a","namespace":"a"},{"id":2,"first_name":"b","namespace":"b"}]`))
+	}))
+	defer srv.Close()
+	c := New("prod")
+	c.BaseURL = srv.URL
+	got, err := c.ListClients(context.Background())
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(got) != 2 || got[0].ID != 1 || got[1].ID != 2 {
+		t.Fatalf("want 2 clients [1,2] from a bare array, got %+v", got)
+	}
+}
diff --git a/internal/cli/auth.go b/internal/cli/auth.go
index 9819d29..9176586 100644
--- a/internal/cli/auth.go
+++ b/internal/cli/auth.go
@@ -123,7 +123,9 @@ func runLogin(ctx context.Context, p *ui.Printer, envFlag string) error {
 		case errors.Is(err, api.ErrAuthorizationPending):
 			// not approved yet — keep polling
 		case errors.Is(err, api.ErrSlowDown):
-			interval++
+			// RFC 8628 §3.5: on slow_down the client MUST increase the poll
+			// interval by 5 seconds for this and all subsequent polls.
+			interval += 5
 		case errors.Is(err, api.ErrExpiredToken):
 			return &exitError{code: 1, err: errors.New("the sign-in code expired — re-run `tracebloc login`")}
 		case errors.Is(err, api.ErrAccessDenied):
diff --git a/internal/cli/auth_test.go b/internal/cli/auth_test.go
index d6b2460..d17ce6e 100644
--- a/internal/cli/auth_test.go
+++ b/internal/cli/auth_test.go
@@ -87,6 +87,55 @@ func TestLogin_FullFlow(t *testing.T) {
 	}
 }
 
+// TestLogin_SlowDownBacksOffByFive pins RFC 8628 §3.5: on `slow_down` the poll
+// interval must increase by 5 seconds, not 1. Captures the durations handed to
+// the pollAfter seam.
+func TestLogin_SlowDownBacksOffByFive(t *testing.T) {
+	t.Setenv("TRACEBLOC_CONFIG_DIR", t.TempDir())
+	var polls int
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		switch r.URL.Path {
+		case "/device/code":
+			_, _ = w.Write([]byte(`{"device_code":"dc","user_code":"X","verification_uri":"https://x/activate","expires_in":600,"interval":5}`))
+		case "/device/token":
+			polls++
+			if polls == 1 {
+				w.WriteHeader(http.StatusBadRequest)
+				_, _ = w.Write([]byte(`{"error":"slow_down"}`))
+				return
+			}
+			_, _ = w.Write([]byte(`{"token":"cat_ok"}`))
+		case "/userinfo/":
+			_, _ = w.Write([]byte(`{"email":"e@co","account":"A"}`))
+		}
+	}))
+	t.Cleanup(srv.Close)
+
+	origClient, origAfter := newAPIClient, pollAfter
+	newAPIClient = func(string) *api.Client { return &api.Client{BaseURL: srv.URL, HTTP: srv.Client()} }
+	var waits []time.Duration
+	pollAfter = func(d time.Duration) <-chan time.Time {
+		waits = append(waits, d)
+		ch := make(chan time.Time, 1)
+		ch <- time.Time{}
+		return ch
+	}
+	t.Cleanup(func() { newAPIClient = origClient; pollAfter = origAfter })
+
+	if _, err := runCmd(t, "login"); err != nil {
+		t.Fatalf("login: %v", err)
+	}
+	if len(waits) < 2 {
+		t.Fatalf("expected >=2 polls, got waits=%v", waits)
+	}
+	if waits[0] != 5*time.Second {
+		t.Errorf("first poll wait = %v, want 5s (server interval)", waits[0])
+	}
+	if waits[1] != 10*time.Second {
+		t.Errorf("post-slow_down wait = %v, want 10s (interval+5 per RFC 8628), not 6s", waits[1])
+	}
+}
+
 func TestLogin_BackendUnsupported(t *testing.T) {
 	withTestBackend(t, func(w http.ResponseWriter, _ *http.Request) {
 		w.WriteHeader(http.StatusNotFound)
diff --git a/internal/cli/client.go b/internal/cli/client.go
index c4d06a3..328bbc3 100644
--- a/internal/cli/client.go
+++ b/internal/cli/client.go
@@ -167,6 +167,13 @@ func runClientCreate(ctx context.Context, p *ui.Printer, pr prompter, opts clien
 	var existing []string
 	if clients, lerr := client.ListClients(ctx); lerr == nil {
 		for _, c := range clients {
+			// Skip the client already anchored to this cluster: a re-run adopts it
+			// (the backend keys on cluster_id), so its namespace isn't a collision.
+			// Counting it would bump the derived slug and show a namespace in the
+			// review that doesn't match the one actually adopted.
+			if clusterID != "" && c.ClusterID == clusterID {
+				continue
+			}
 			if c.Namespace != "" {
 				existing = append(existing, c.Namespace)
 			}
diff --git a/internal/cli/client_test.go b/internal/cli/client_test.go
index bd79021..dcdcc1c 100644
--- a/internal/cli/client_test.go
+++ b/internal/cli/client_test.go
@@ -521,3 +521,31 @@ func TestClientCreate_NoClusterAnchorWarns(t *testing.T) {
 		t.Errorf("no-anchor fallback should still print the credential, got:\n%s", out.String())
 	}
 }
+
+// TestClientCreate_ReRunReviewShowsAdoptedNamespace pins that on an idempotent
+// re-run, the client already anchored to this cluster is excluded from collision
+// detection — so the review shows the namespace that's actually adopted
+// (lab-one), not a bumped lab-one-2.
+func TestClientCreate_ReRunReviewShowsAdoptedNamespace(t *testing.T) {
+	withClientBackend(t, func(w http.ResponseWriter, r *http.Request) {
+		switch {
+		case r.Method == http.MethodGet && r.URL.Path == "/edge-device/":
+			// An existing client already anchored to THIS cluster (uid-1).
+			_, _ = w.Write([]byte(`[{"id":1,"first_name":"Lab One","username":"u-1","namespace":"lab-one","location":"DE","cluster_id":"uid-1"}]`))
+		case r.Method == http.MethodPost && r.URL.Path == "/edge-device/":
+			w.WriteHeader(http.StatusOK) // adopt
+			_, _ = w.Write([]byte(`{"id":1,"first_name":"Lab One","username":"u-1","namespace":"lab-one","location":"DE","cluster_id":"uid-1"}`))
+		}
+	})
+	stubClusterID(t, "uid-1", nil)
+	confirmYes := true
+	pr := &fakePrompter{answers: map[string]string{}, confirm: &confirmYes}
+	var out bytes.Buffer
+	if err := runClientCreate(context.Background(), ui.New(&out), pr,
+		clientCreateOpts{name: "Lab One", location: "DE"}); err != nil {
+		t.Fatalf("re-run create: %v", err)
+	}
+	if strings.Contains(out.String(), "lab-one-2") {
+		t.Errorf("review showed a bumped namespace — the cluster's own client wasn't excluded from collision detection:\n%s", out.String())
+	}
+}
diff --git a/internal/doctor/doctor.go b/internal/doctor/doctor.go
index fdcc4ec..d69ce9c 100644
--- a/internal/doctor/doctor.go
+++ b/internal/doctor/doctor.go
@@ -288,7 +288,10 @@ func checkBackendEgress(ctx context.Context, env map[string]string, probe func(c
 // runtime's own mapping (controller.py). Unset/unknown defaults to prod, the
 // chart's CLIENT_ENV default.
 func backendHost(clientEnv string) string {
-	switch clientEnv {
+	// Normalize like the API client (api.ResolveEnv/BaseURL lower-case), so a
+	// non-lowercase CLIENT_ENV on the edge box doesn't fall through to prod and
+	// make the doctor probe the wrong backend.
+	switch strings.ToLower(strings.TrimSpace(clientEnv)) {
 	case "dev":
 		return "dev-api.tracebloc.io"
 	case "stg":
diff --git a/internal/doctor/doctor_test.go b/internal/doctor/doctor_test.go
index 766fe93..7eaaa22 100644
--- a/internal/doctor/doctor_test.go
+++ b/internal/doctor/doctor_test.go
@@ -232,6 +232,11 @@ func TestBackendHost(t *testing.T) {
 		"prod":  "api.tracebloc.io",
 		"":      "api.tracebloc.io",
 		"weird": "api.tracebloc.io",
+		// Case/space-insensitive, matching the API client's env resolution — a
+		// non-lowercase CLIENT_ENV must not fall through to prod.
+		"DEV":   "dev-api.tracebloc.io",
+		"Stg":   "stg-api.tracebloc.io",
+		" dev ": "dev-api.tracebloc.io",
 	}
 	for in, want := range tests {
 		if got := backendHost(in); got != want {