diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index d1c8dba..d927706 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -14,8 +14,13 @@ on:
 permissions:
   contents: read
 
+concurrency:
+  group: build-${{ github.ref }}
+  cancel-in-progress: ${{ github.event_name == 'pull_request' }}
+
 jobs:
   schema-drift:
+    timeout-minutes: 10
     name: Schema drift check
     # Verifies the embedded internal/schema/ingest.v1.json matches
     # tracebloc/data-ingestors' master. A green PR that silently
@@ -30,6 +35,7 @@ jobs:
         run: ./scripts/sync-schema.sh --check
 
   test:
+    timeout-minutes: 15
     name: Test
     runs-on: ubuntu-latest
     steps:
@@ -52,6 +58,7 @@ jobs:
         run: go test -race -cover ./...
 
   lint:
+    timeout-minutes: 10
     name: Lint
     runs-on: ubuntu-latest
     steps:
@@ -103,6 +110,7 @@ jobs:
           misspell -error .
 
   build:
+    timeout-minutes: 20
     name: Build (${{ matrix.os }}/${{ matrix.arch }})
     runs-on: ubuntu-latest
     strategy:
diff --git a/.github/workflows/e2e.yml b/.github/workflows/e2e.yml
index 660921c..10a0beb 100644
--- a/.github/workflows/e2e.yml
+++ b/.github/workflows/e2e.yml
@@ -21,8 +21,13 @@ on:
 permissions:
   contents: read
 
+concurrency:
+  group: e2e-${{ github.ref }}
+  cancel-in-progress: ${{ github.event_name == 'pull_request' }}
+
 jobs:
   integration:
+    timeout-minutes: 30
     name: Integration (kind)
     runs-on: ubuntu-latest
     # Skip on PRs that aren't explicitly opted in via the `e2e` label;
diff --git a/.github/workflows/public-pii-gate-caller.yml b/.github/workflows/public-pii-gate-caller.yml
new file mode 100644
index 0000000..4af6016
--- /dev/null
+++ b/.github/workflows/public-pii-gate-caller.yml
@@ -0,0 +1,14 @@
+name: Public PII gate
+
+# Per-repo caller for the public-repo PII gate. Blocks PRs whose title/body/
+# commits contain a denylisted customer/partner name or known secret.
+# Logic lives in tracebloc/.github/.github/workflows/public-pii-gate.yml.
+
+on:
+  pull_request:
+    types: [opened, edited, reopened, synchronize, labeled, unlabeled]
+
+jobs:
+  pii-gate:
+    uses: tracebloc/.github/.github/workflows/public-pii-gate.yml@main
+    secrets: inherit
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 1468f1c..0a8f889 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -44,8 +44,13 @@ permissions:
   contents: write  # create / update the GitHub Release
   id-token: write  # cosign keyless OIDC
 
+concurrency:
+  group: release-${{ github.ref }}
+  cancel-in-progress: ${{ github.event_name == 'pull_request' }}
+
 jobs:
   release:
+    timeout-minutes: 20
     name: Build + sign + publish
     runs-on: ubuntu-latest
     strategy:
@@ -158,6 +163,7 @@ jobs:
           if-no-files-found: error
 
   publish:
+    timeout-minutes: 20
     name: Aggregate + create GitHub Release
     runs-on: ubuntu-latest
     needs: release
@@ -220,6 +226,7 @@ jobs:
   # add a HOMEBREW_TAP_TOKEN repo secret with write access to
   # tracebloc/homebrew-tap) when that repo exists.
   bump-homebrew-tap:
+    timeout-minutes: 20
     name: Bump Homebrew tap formula
     runs-on: ubuntu-latest
     needs: publish
diff --git a/internal/api/client.go b/internal/api/client.go
new file mode 100644
index 0000000..f0e8187
--- /dev/null
+++ b/internal/api/client.go
@@ -0,0 +1,397 @@
+// Package api is the tracebloc CLI's client for the central backend REST API
+// (browser login + client provisioning). It is distinct from internal/submit,
+// which talks to the in-cluster jobs-manager: this one reaches the public
+// backend over real TLS. RFC-0001 (backend#830); the device-flow endpoints
+// (/device/code, /device/token) land in backend#835, so RequestDeviceCode /
+// PollToken are written against the RFC 8628 spec and go live when that ships.
+package api
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"io"
+	"net/http"
+	"net/url"
+	"os"
+	"strings"
+	"time"
+)
+
+// Backend environments (mirror CLIENT_ENV).
+const (
+	EnvDev  = "dev"
+	EnvStg  = "stg"
+	EnvProd = "prod"
+)
+
+const defaultTimeout = 30 * time.Second
+
+// BaseURL maps a CLIENT_ENV value to the backend base URL — kept in lock-step
+// with the installer's `_backend_url` and client-runtime's CLIENT_ENV→backend
+// mapping. Unknown / empty → prod.
+func BaseURL(env string) string {
+	switch strings.ToLower(env) {
+	case EnvDev:
+		return "https://dev-api.tracebloc.io"
+	case EnvStg:
+		return "https://stg-api.tracebloc.io"
+	default:
+		return "https://api.tracebloc.io"
+	}
+}
+
+// ResolveEnv picks the backend env: an explicit value (a --env flag) wins,
+// then $CLIENT_ENV, then prod.
+func ResolveEnv(explicit string) string {
+	if explicit != "" {
+		return strings.ToLower(explicit)
+	}
+	if e := os.Getenv("CLIENT_ENV"); e != "" {
+		return strings.ToLower(e)
+	}
+	return EnvProd
+}
+
+// Client talks to the backend REST API. Token (the user token from login) is
+// optional: the device-flow endpoints are unauthenticated; provisioning calls
+// set it.
+type Client struct {
+	BaseURL string
+	Token   string
+	HTTP    *http.Client
+}
+
+// New returns a Client for the given env. The HTTP client is proxy- and
+// CA-aware — it honors HTTP(S)_PROXY/NO_PROXY and the system cert pool —
+// because RFC-0001 must work behind a corporate / TLS-inspecting proxy
+// (backend#830 Q1). Unlike internal/submit (in-cluster, no real TLS) this
+// verifies certificates: it's the public backend.
+func New(env string) *Client {
+	return &Client{
+		BaseURL: strings.TrimRight(BaseURL(env), "/"),
+		HTTP: &http.Client{
+			Timeout:   defaultTimeout,
+			Transport: &http.Transport{Proxy: http.ProxyFromEnvironment},
+		},
+	}
+}
+
+// APIError is a non-2xx response, with the remote body surfaced verbatim.
+type APIError struct {
+	StatusCode int
+	Body       string
+	URL        string
+}
+
+func (e *APIError) Error() string {
+	return fmt.Sprintf("%s returned HTTP %d: %s", e.URL, e.StatusCode, strings.TrimSpace(e.Body))
+}
+
+// post sends an optional JSON body and returns the status code + raw response.
+func (c *Client) post(ctx context.Context, path string, body any) (int, []byte, error) {
+	var rdr io.Reader
+	if body != nil {
+		b, err := json.Marshal(body)
+		if err != nil {
+			return 0, nil, fmt.Errorf("encoding request: %w", err)
+		}
+		rdr = bytes.NewReader(b)
+	}
+	url := c.BaseURL + path
+	req, err := http.NewRequestWithContext(ctx, http.MethodPost, url, rdr)
+	if err != nil {
+		return 0, nil, fmt.Errorf("building request: %w", err)
+	}
+	req.Header.Set("Content-Type", "application/json")
+	c.setAuth(req)
+	resp, err := c.HTTP.Do(req)
+	if err != nil {
+		return 0, nil, fmt.Errorf("POST %s: %w", url, err)
+	}
+	defer func() { _ = resp.Body.Close() }()
+	raw, err := io.ReadAll(resp.Body)
+	if err != nil {
+		return resp.StatusCode, nil, fmt.Errorf("reading response from %s: %w", url, err)
+	}
+	return resp.StatusCode, raw, nil
+}
+
+// setAuth attaches the stored token as a Bearer credential. The login token is
+// a ClientAccessToken, authenticated by the backend's
+// ClientAccessTokenAuthentication (keyword "Bearer", backend#835) — NOT the
+// legacy DRF "Token" scheme.
+func (c *Client) setAuth(req *http.Request) {
+	if c.Token != "" {
+		req.Header.Set("Authorization", "Bearer "+c.Token)
+	}
+}
+
+// get sends an authenticated GET and returns the status code + raw response.
+func (c *Client) get(ctx context.Context, path string) (int, []byte, error) {
+	url := c.BaseURL + path
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
+	if err != nil {
+		return 0, nil, fmt.Errorf("building request: %w", err)
+	}
+	c.setAuth(req)
+	resp, err := c.HTTP.Do(req)
+	if err != nil {
+		return 0, nil, fmt.Errorf("GET %s: %w", url, err)
+	}
+	defer func() { _ = resp.Body.Close() }()
+	raw, err := io.ReadAll(resp.Body)
+	if err != nil {
+		return resp.StatusCode, nil, fmt.Errorf("reading response from %s: %w", url, err)
+	}
+	return resp.StatusCode, raw, nil
+}
+
+// ── Device Authorization Grant (RFC 8628) — backend endpoints land in #835 ──
+
+// DeviceCodeResponse is the reply from POST /device/code.
+type DeviceCodeResponse struct {
+	DeviceCode              string `json:"device_code"`
+	UserCode                string `json:"user_code"`
+	VerificationURI         string `json:"verification_uri"`
+	VerificationURIComplete string `json:"verification_uri_complete"`
+	ExpiresIn               int    `json:"expires_in"`
+	Interval                int    `json:"interval"`
+}
+
+// RequestDeviceCode starts the device flow.
+func (c *Client) RequestDeviceCode(ctx context.Context) (*DeviceCodeResponse, error) {
+	url := c.BaseURL + "/device/code"
+	status, raw, err := c.post(ctx, "/device/code", nil)
+	if err != nil {
+		return nil, err
+	}
+	if status < 200 || status >= 300 {
+		return nil, &APIError{StatusCode: status, Body: string(raw), URL: url}
+	}
+	var out DeviceCodeResponse
+	if err := json.Unmarshal(raw, &out); err != nil {
+		return nil, fmt.Errorf("decoding device-code response: %w", err)
+	}
+	if out.DeviceCode == "" || out.UserCode == "" {
+		return nil, fmt.Errorf("device-code response missing device_code/user_code (got %q)", string(raw))
+	}
+	return &out, nil
+}
+
+// Device-flow poll outcomes (RFC 8628 §3.5): pending / slow_down mean "keep
+// polling"; expired_token / access_denied are terminal.
+var (
+	ErrAuthorizationPending = errors.New("authorization_pending")
+	ErrSlowDown             = errors.New("slow_down")
+	ErrExpiredToken         = errors.New("expired_token")
+	ErrAccessDenied         = errors.New("access_denied")
+)
+
+// PollToken polls POST /device/token once. It returns the user token on
+// approval, or one of the Err* sentinels (pending/slow_down → keep polling,
+// expired/denied → stop), or an *APIError for anything else.
+func (c *Client) PollToken(ctx context.Context, deviceCode string) (string, error) {
+	url := c.BaseURL + "/device/token"
+	status, raw, err := c.post(ctx, "/device/token", map[string]string{"device_code": deviceCode})
+	if err != nil {
+		return "", err
+	}
+	var body struct {
+		Token string `json:"token"`
+		Error string `json:"error"`
+	}
+	_ = json.Unmarshal(raw, &body) // best-effort; the status + error field drive the result
+	if status >= 200 && status < 300 {
+		if body.Token == "" {
+			return "", fmt.Errorf("device-token success response missing token (got %q)", string(raw))
+		}
+		return body.Token, nil
+	}
+	switch body.Error {
+	case "authorization_pending":
+		return "", ErrAuthorizationPending
+	case "slow_down":
+		return "", ErrSlowDown
+	case "expired_token":
+		return "", ErrExpiredToken
+	case "access_denied":
+		return "", ErrAccessDenied
+	}
+	return "", &APIError{StatusCode: status, Body: string(raw), URL: url}
+}
+
+// ── Authenticated calls (Bearer ClientAccessToken) ──
+
+// Identity is the signed-in user, from GET /userinfo/.
+type Identity struct {
+	Email   string `json:"email"`
+	Type    string `json:"type"`
+	Account string `json:"account"`
+}
+
+// WhoAmI fetches the signed-in user from the backend, authenticating with the
+// stored token (Bearer). It confirms the token is live and returns the account
+// — `login` uses it to verify the credential it just obtained. Requires Token.
+func (c *Client) WhoAmI(ctx context.Context) (*Identity, error) {
+	url := c.BaseURL + "/userinfo/"
+	status, raw, err := c.get(ctx, "/userinfo/")
+	if err != nil {
+		return nil, err
+	}
+	if status < 200 || status >= 300 {
+		return nil, &APIError{StatusCode: status, Body: string(raw), URL: url}
+	}
+	var id Identity
+	if err := json.Unmarshal(raw, &id); err != nil {
+		return nil, fmt.Errorf("decoding userinfo response: %w", err)
+	}
+	return &id, nil
+}
+
+// ── Client provisioning (Bearer-authed) — backend#836, /edge-device/ ──
+
+// ProvisionedClient is a tracebloc client (machine), as returned by the
+// EdgeDevice endpoints.
+type ProvisionedClient struct {
+	ID        int    `json:"id"`
+	Name      string `json:"first_name"`
+	Username  string `json:"username"`
+	Namespace string `json:"namespace"`
+	Location  string `json:"location"`
+	Status    int    `json:"status"`
+	// ClusterID is the kube-system namespace UID this client is anchored to
+	// (RFC-0001 §6.3 / backend#883). Empty on legacy / not-yet-backfilled clients.
+	ClusterID string `json:"cluster_id"`
+}
+
+// CreateClientRequest is the POST /edge-device/ body. The account is stamped
+// server-side from the token; password is the machine credential the caller
+// generates (write-only on the backend).
+type CreateClientRequest struct {
+	Name      string `json:"first_name"`
+	Namespace string `json:"namespace"`
+	Location  string `json:"location"`
+	Password  string `json:"password"`
+	// ClusterID anchors the client to this cluster (the kube-system namespace UID)
+	// so create is get-or-create keyed on it (RFC-0001 §7.2 / backend#883). Omitted
+	// when the cluster identity can't be read (dual-mode / legacy → plain mint).
+	ClusterID string `json:"cluster_id,omitempty"`
+}
+
+// AdminContact is one "ask an admin" entry from GET /edge-device/admins/.
+type AdminContact struct {
+	Name  string `json:"name"`
+	Email string `json:"email"`
+}
+
+// CreateClient provisions a client — get-or-create keyed on cluster_id when one
+// is supplied (RFC-0001 §7.2 / backend#883). The returned `adopted` is true when
+// the backend matched an existing client for this cluster (HTTP 200, an idempotent
+// re-run) and false when it minted a new one (HTTP 201). A 403 *APIError means the
+// caller lacks CLIENT_WRITE (→ ask-an-admin, backend#836 Q4); a 409 *APIError means
+// the cluster is bound to another account (cluster_conflict, R6).
+func (c *Client) CreateClient(ctx context.Context, req CreateClientRequest) (pc *ProvisionedClient, adopted bool, err error) {
+	url := c.BaseURL + "/edge-device/"
+	status, raw, err := c.post(ctx, "/edge-device/", req)
+	if err != nil {
+		return nil, false, err
+	}
+	if status < 200 || status >= 300 {
+		return nil, false, &APIError{StatusCode: status, Body: string(raw), URL: url}
+	}
+	var out ProvisionedClient
+	if err := json.Unmarshal(raw, &out); err != nil {
+		return nil, false, fmt.Errorf("decoding create-client response: %w", err)
+	}
+	// 200 = adopted an existing client for this cluster_id; 201 = freshly minted.
+	return &out, status == http.StatusOK, nil
+}
+
+// maxListPages bounds how many pages ListClients will follow — a backstop
+// against a misbehaving `next` chain, set well above any real account.
+const maxListPages = 100
+
+// ListClients returns ALL clients in the caller's account (GET /edge-device/).
+// The endpoint is DRF-paginated, so this follows `next` to the end — list,
+// `use <id>`, and create-time collision detection must see every client, not
+// just the first page. Also tolerates a bare (unpaginated) list body.
+func (c *Client) ListClients(ctx context.Context) ([]ProvisionedClient, error) {
+	var all []ProvisionedClient
+	path := "/edge-device/"
+	for pageNum := 0; path != ""; pageNum++ {
+		if pageNum >= maxListPages {
+			return nil, fmt.Errorf("client list exceeded %d pages — aborting", maxListPages)
+		}
+		reqURL := c.BaseURL + path
+		status, raw, err := c.get(ctx, path)
+		if err != nil {
+			return nil, err
+		}
+		if status < 200 || status >= 300 {
+			return nil, &APIError{StatusCode: status, Body: string(raw), URL: reqURL}
+		}
+		// Unpaginated deployment → a bare array. Only valid as the sole response
+		// (a paginated chain is a `{next,results}` object on every page), so guard
+		// to page 0 — a stray bare body mid-chain must not silently end the loop.
+		if pageNum == 0 {
+			var bare []ProvisionedClient
+			if err := json.Unmarshal(raw, &bare); err == nil {
+				return bare, nil
+			}
+		}
+		var body struct {
+			Next    string              `json:"next"`
+			Results []ProvisionedClient `json:"results"`
+		}
+		if err := json.Unmarshal(raw, &body); err != nil {
+			return nil, fmt.Errorf("decoding client list: %w", err)
+		}
+		all = append(all, body.Results...)
+		next, perr := nextPath(body.Next)
+		if perr != nil {
+			return nil, perr
+		}
+		path = next
+	}
+	return all, nil
+}
+
+// nextPath reduces a DRF `next` link (an absolute URL) to the path+query this
+// client appends to BaseURL. An empty link returns ("", nil) — the normal
+// end of pages. A non-empty link that won't parse is an error, NOT a silent
+// "", so the loop never quietly stops mid-list and returns only the pages
+// seen so far (list / `use` / collision checks must see every client).
+func nextPath(next string) (string, error) {
+	if next == "" {
+		return "", nil
+	}
+	u, err := url.Parse(next)
+	if err != nil {
+		return "", fmt.Errorf("client list: unparseable pagination link %q: %w", next, err)
+	}
+	if u.RawQuery != "" {
+		return u.Path + "?" + u.RawQuery, nil
+	}
+	return u.Path, nil
+}
+
+// ListClientAdmins returns who in the account can provision (the ask-an-admin
+// path), from GET /edge-device/admins/ (backend#836 Q4).
+func (c *Client) ListClientAdmins(ctx context.Context) ([]AdminContact, error) {
+	url := c.BaseURL + "/edge-device/admins/"
+	status, raw, err := c.get(ctx, "/edge-device/admins/")
+	if err != nil {
+		return nil, err
+	}
+	if status < 200 || status >= 300 {
+		return nil, &APIError{StatusCode: status, Body: string(raw), URL: url}
+	}
+	var out []AdminContact
+	if err := json.Unmarshal(raw, &out); err != nil {
+		return nil, fmt.Errorf("decoding admins response: %w", err)
+	}
+	return out, nil
+}
diff --git a/internal/api/client_test.go b/internal/api/client_test.go
new file mode 100644
index 0000000..e091b79
--- /dev/null
+++ b/internal/api/client_test.go
@@ -0,0 +1,255 @@
+package api
+
+import (
+	"context"
+	"encoding/json"
+	"errors"
+	"net/http"
+	"net/http/httptest"
+	"testing"
+)
+
+func TestBaseURL(t *testing.T) {
+	cases := map[string]string{
+		"dev":   "https://dev-api.tracebloc.io",
+		"stg":   "https://stg-api.tracebloc.io",
+		"prod":  "https://api.tracebloc.io",
+		"":      "https://api.tracebloc.io",
+		"DEV":   "https://dev-api.tracebloc.io", // case-insensitive
+		"weird": "https://api.tracebloc.io",     // unknown -> prod
+	}
+	for env, want := range cases {
+		if got := BaseURL(env); got != want {
+			t.Errorf("BaseURL(%q) = %q, want %q", env, got, want)
+		}
+	}
+}
+
+func TestResolveEnv(t *testing.T) {
+	t.Setenv("CLIENT_ENV", "stg")
+	if got := ResolveEnv("dev"); got != "dev" {
+		t.Errorf("explicit should win: got %q", got)
+	}
+	if got := ResolveEnv(""); got != "stg" {
+		t.Errorf("CLIENT_ENV should be used: got %q", got)
+	}
+	t.Setenv("CLIENT_ENV", "")
+	if got := ResolveEnv(""); got != "prod" {
+		t.Errorf("default should be prod: got %q", got)
+	}
+}
+
+func TestRequestDeviceCode(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		if r.URL.Path != "/device/code" || r.Method != http.MethodPost {
+			t.Errorf("unexpected %s %s", r.Method, r.URL.Path)
+		}
+		_, _ = w.Write([]byte(`{"device_code":"dc","user_code":"WDJB-MJHT","verification_uri":"https://x/activate","expires_in":600,"interval":5}`))
+	}))
+	defer srv.Close()
+	c := New("prod")
+	c.BaseURL = srv.URL
+	resp, err := c.RequestDeviceCode(context.Background())
+	if err != nil {
+		t.Fatal(err)
+	}
+	if resp.UserCode != "WDJB-MJHT" || resp.Interval != 5 || resp.DeviceCode != "dc" {
+		t.Errorf("got %+v", resp)
+	}
+}
+
+func TestPollTokenSequence(t *testing.T) {
+	var n int
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+		n++
+		switch n {
+		case 1:
+			w.WriteHeader(http.StatusBadRequest)
+			_, _ = w.Write([]byte(`{"error":"authorization_pending"}`))
+		case 2:
+			w.WriteHeader(http.StatusBadRequest)
+			_, _ = w.Write([]byte(`{"error":"slow_down"}`))
+		default:
+			_, _ = w.Write([]byte(`{"token":"usertoken123"}`))
+		}
+	}))
+	defer srv.Close()
+	c := New("prod")
+	c.BaseURL = srv.URL
+	ctx := context.Background()
+	if _, err := c.PollToken(ctx, "dc"); !errors.Is(err, ErrAuthorizationPending) {
+		t.Errorf("poll 1: want pending, got %v", err)
+	}
+	if _, err := c.PollToken(ctx, "dc"); !errors.Is(err, ErrSlowDown) {
+		t.Errorf("poll 2: want slow_down, got %v", err)
+	}
+	tok, err := c.PollToken(ctx, "dc")
+	if err != nil || tok != "usertoken123" {
+		t.Errorf("poll 3: want token, got %q / %v", tok, err)
+	}
+}
+
+func TestPollTokenDenied(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+		w.WriteHeader(http.StatusBadRequest)
+		_, _ = w.Write([]byte(`{"error":"access_denied"}`))
+	}))
+	defer srv.Close()
+	c := New("prod")
+	c.BaseURL = srv.URL
+	if _, err := c.PollToken(context.Background(), "dc"); !errors.Is(err, ErrAccessDenied) {
+		t.Errorf("want access_denied, got %v", err)
+	}
+}
+
+func TestWhoAmI(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		if r.URL.Path != "/userinfo/" || r.Method != http.MethodGet {
+			t.Errorf("unexpected %s %s", r.Method, r.URL.Path)
+		}
+		if got := r.Header.Get("Authorization"); got != "Bearer usertoken123" {
+			t.Errorf("auth header = %q, want %q", got, "Bearer usertoken123")
+		}
+		_, _ = w.Write([]byte(`{"email":"ds@tracebloc.io","type":"DS","account":"Acme"}`))
+	}))
+	defer srv.Close()
+	c := New("prod")
+	c.BaseURL = srv.URL
+	c.Token = "usertoken123"
+	id, err := c.WhoAmI(context.Background())
+	if err != nil {
+		t.Fatal(err)
+	}
+	if id.Email != "ds@tracebloc.io" || id.Account != "Acme" {
+		t.Errorf("got %+v", id)
+	}
+}
+
+func TestWhoAmIUnauthorized(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+		w.WriteHeader(http.StatusUnauthorized)
+		_, _ = w.Write([]byte(`{"detail":"Invalid token."}`))
+	}))
+	defer srv.Close()
+	c := New("prod")
+	c.BaseURL = srv.URL
+	c.Token = "bad"
+	var ae *APIError
+	if _, err := c.WhoAmI(context.Background()); !errors.As(err, &ae) || ae.StatusCode != http.StatusUnauthorized {
+		t.Errorf("want APIError 401, got %v", err)
+	}
+}
+
+func TestCreateClientMintAndAdopt(t *testing.T) {
+	for _, tc := range []struct {
+		name        string
+		code        int
+		wantAdopted bool
+	}{
+		{"mint", http.StatusCreated, false},
+		{"adopt", http.StatusOK, true},
+	} {
+		t.Run(tc.name, func(t *testing.T) {
+			var sent CreateClientRequest
+			srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+				if r.URL.Path != "/edge-device/" || r.Method != http.MethodPost {
+					t.Errorf("unexpected %s %s", r.Method, r.URL.Path)
+				}
+				_ = json.NewDecoder(r.Body).Decode(&sent)
+				w.WriteHeader(tc.code)
+				_, _ = w.Write([]byte(`{"id":5,"first_name":"c","namespace":"c","cluster_id":"uid-1"}`))
+			}))
+			defer srv.Close()
+			c := New("prod")
+			c.BaseURL = srv.URL
+			pc, adopted, err := c.CreateClient(context.Background(),
+				CreateClientRequest{Name: "c", Namespace: "c", Password: "pw", ClusterID: "uid-1"})
+			if err != nil {
+				t.Fatal(err)
+			}
+			if adopted != tc.wantAdopted {
+				t.Errorf("adopted = %v, want %v", adopted, tc.wantAdopted)
+			}
+			if sent.ClusterID != "uid-1" {
+				t.Errorf("cluster_id sent = %q, want uid-1", sent.ClusterID)
+			}
+			if pc.ClusterID != "uid-1" {
+				t.Errorf("cluster_id parsed = %q, want uid-1", pc.ClusterID)
+			}
+		})
+	}
+}
+
+func TestCreateClientConflict(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+		w.WriteHeader(http.StatusConflict)
+		_, _ = w.Write([]byte(`{"error":"cluster_conflict","cluster_id":"uid-1"}`))
+	}))
+	defer srv.Close()
+	c := New("prod")
+	c.BaseURL = srv.URL
+	var ae *APIError
+	_, _, err := c.CreateClient(context.Background(), CreateClientRequest{ClusterID: "uid-1"})
+	if !errors.As(err, &ae) || ae.StatusCode != http.StatusConflict {
+		t.Errorf("want APIError 409, got %v", err)
+	}
+}
+
+// TestListClients_FollowsPagination guards that DRF pagination is still
+// followed end-to-end after the nextPath refactor (page 1 → page 2 → done).
+func TestListClients_FollowsPagination(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		if r.URL.Query().Get("page") == "2" {
+			_, _ = w.Write([]byte(`{"next":"","results":[{"id":2,"first_name":"b","namespace":"b"}]}`))
+			return
+		}
+		// DRF emits an absolute `next`; nextPath keeps only path+query.
+		_, _ = w.Write([]byte(`{"next":"http://x/edge-device/?page=2","results":[{"id":1,"first_name":"a","namespace":"a"}]}`))
+	}))
+	defer srv.Close()
+	c := New("prod")
+	c.BaseURL = srv.URL
+	got, err := c.ListClients(context.Background())
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(got) != 2 || got[0].ID != 1 || got[1].ID != 2 {
+		t.Fatalf("want 2 clients [1,2], got %+v", got)
+	}
+}
+
+// TestListClients_UnparseableNextLink_IsError pins the Bugbot fix (v0.4.0 RC):
+// a non-empty `next` the server sends that url.Parse rejects must be a hard
+// error, never a silent truncation to the pages seen so far — otherwise list /
+// `use` / namespace-collision checks would miss clients without any error.
+func TestListClients_UnparseableNextLink_IsError(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+		// `next` carries a control byte (\u007f) → url.Parse fails.
+		_, _ = w.Write([]byte(`{"next":"http://x/\u007f","results":[{"id":1,"first_name":"a","namespace":"a"}]}`))
+	}))
+	defer srv.Close()
+	c := New("prod")
+	c.BaseURL = srv.URL
+	if _, err := c.ListClients(context.Background()); err == nil {
+		t.Fatal("expected an error on an unparseable next link, got nil (silent truncation)")
+	}
+}
+
+// TestListClients_BareArrayUnpaginated covers the unpaginated deployment shape
+// (a bare JSON array) — still returned as-is after the bare-decode was guarded
+// to the first page.
+func TestListClients_BareArrayUnpaginated(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+		_, _ = w.Write([]byte(`[{"id":1,"first_name":"a","namespace":"a"},{"id":2,"first_name":"b","namespace":"b"}]`))
+	}))
+	defer srv.Close()
+	c := New("prod")
+	c.BaseURL = srv.URL
+	got, err := c.ListClients(context.Background())
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(got) != 2 || got[0].ID != 1 || got[1].ID != 2 {
+		t.Fatalf("want 2 clients [1,2] from a bare array, got %+v", got)
+	}
+}
diff --git a/internal/cli/auth.go b/internal/cli/auth.go
new file mode 100644
index 0000000..9176586
--- /dev/null
+++ b/internal/cli/auth.go
@@ -0,0 +1,212 @@
+package cli
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"net/http"
+	"time"
+
+	"github.com/spf13/cobra"
+
+	"github.com/tracebloc/cli/internal/api"
+	"github.com/tracebloc/cli/internal/config"
+	"github.com/tracebloc/cli/internal/ui"
+)
+
+// newLoginCmd implements `tracebloc login` — browser sign-in via the OAuth 2.0
+// Device Authorization Grant (RFC 8628). Works on a headless box: the CLI shows
+// a URL + short code, the human approves in a browser on any device, and the
+// CLI polls until a user token is issued and stored in ~/.tracebloc (0600).
+// The backend endpoints land in backend#835; until then login reports that the
+// backend doesn't support browser sign-in yet.
+func newLoginCmd() *cobra.Command {
+	var envFlag string
+	cmd := &cobra.Command{
+		Use:   "login",
+		Short: "Sign in to tracebloc in your browser (device flow)",
+		Long: `Sign in to tracebloc. The CLI prints a URL + short code; open the URL
+on any device (your laptop or phone), sign in the way you already do
+(password, Google, or GitHub), and approve the code. The CLI stores a
+user token in ~/.tracebloc (mode 0600).
+
+Works on a headless / SSH box — the browser and the CLI need not share a
+machine. Honors HTTP(S)_PROXY / NO_PROXY for corporate-proxy networks.`,
+		Args: cobra.NoArgs,
+		RunE: func(cmd *cobra.Command, _ []string) error {
+			return runLogin(cmd.Context(), printerFor(cmd), envFlag)
+		},
+	}
+	cmd.Flags().StringVar(&envFlag, "env", "",
+		"backend environment: dev|stg|prod (default: $CLIENT_ENV, then prod)")
+	return cmd
+}
+
+// Test seams: the device flow makes real HTTP calls on a timer, so tests
+// override the client factory (point it at an httptest server) and the poll
+// clock (fire immediately) rather than hitting the network / wall clock.
+var (
+	newAPIClient = api.New
+	pollAfter    = time.After
+)
+
+func runLogin(ctx context.Context, p *ui.Printer, envFlag string) error {
+	cfg, err := config.Load()
+	if err != nil {
+		return &exitError{code: 1, err: err}
+	}
+	env := api.ResolveEnv(envFlag)
+	client := newAPIClient(env)
+
+	dc, err := client.RequestDeviceCode(ctx)
+	if err != nil {
+		var ae *api.APIError
+		if errors.As(err, &ae) && ae.StatusCode == http.StatusNotFound {
+			return &exitError{code: 1, err: fmt.Errorf(
+				"this backend (%s) doesn't support browser login yet — the device-grant "+
+					"endpoints land in backend#835: %w", env, err)}
+		}
+		return &exitError{code: 1, err: err}
+	}
+
+	p.Section("Sign in to tracebloc")
+	uri := dc.VerificationURIComplete
+	if uri == "" {
+		uri = dc.VerificationURI
+	}
+	p.Field("open", uri)
+	p.Field("code", dc.UserCode)
+	p.Newline()
+	p.Hintf("Waiting for you to approve in the browser… (Ctrl-C to cancel)")
+
+	interval := dc.Interval
+	if interval <= 0 {
+		interval = 5
+	}
+	var deadline time.Time
+	if dc.ExpiresIn > 0 {
+		deadline = time.Now().Add(time.Duration(dc.ExpiresIn) * time.Second)
+	}
+
+	for {
+		if !deadline.IsZero() && time.Now().After(deadline) {
+			return &exitError{code: 1, err: errors.New("login timed out — re-run `tracebloc login`")}
+		}
+		select {
+		case <-ctx.Done():
+			return ctx.Err()
+		case <-pollAfter(time.Duration(interval) * time.Second):
+		}
+
+		tok, err := client.PollToken(ctx, dc.DeviceCode)
+		switch {
+		case err == nil:
+			cfg.Env = env
+			cfg.Token = tok
+			// Confirm the freshly-issued token actually authenticates, and
+			// capture the account to show + store. Best-effort: don't fail a
+			// successful sign-in just because this lookup couldn't run.
+			client.Token = tok
+			if id, werr := client.WhoAmI(ctx); werr == nil {
+				cfg.Email = id.Email
+			}
+			if err := cfg.Save(); err != nil {
+				return &exitError{code: 1, err: err}
+			}
+			p.Newline()
+			if cfg.Email != "" {
+				p.Successf("Signed in as %s. Token saved to ~/.tracebloc (0600).", cfg.Email)
+			} else {
+				p.Successf("Signed in. Token saved to ~/.tracebloc (0600).")
+			}
+			return nil
+		case errors.Is(err, api.ErrAuthorizationPending):
+			// not approved yet — keep polling
+		case errors.Is(err, api.ErrSlowDown):
+			// RFC 8628 §3.5: on slow_down the client MUST increase the poll
+			// interval by 5 seconds for this and all subsequent polls.
+			interval += 5
+		case errors.Is(err, api.ErrExpiredToken):
+			return &exitError{code: 1, err: errors.New("the sign-in code expired — re-run `tracebloc login`")}
+		case errors.Is(err, api.ErrAccessDenied):
+			return &exitError{code: 1, err: errors.New("sign-in was denied in the browser")}
+		default:
+			return &exitError{code: 1, err: err}
+		}
+	}
+}
+
+// newLogoutCmd implements `tracebloc logout` — clears the stored token.
+func newLogoutCmd() *cobra.Command {
+	return &cobra.Command{
+		Use:   "logout",
+		Short: "Sign out (clear the stored token)",
+		Args:  cobra.NoArgs,
+		RunE: func(cmd *cobra.Command, _ []string) error {
+			cfg, err := config.Load()
+			if err != nil {
+				return &exitError{code: 1, err: err}
+			}
+			if !cfg.SignedIn() {
+				printerFor(cmd).Hintf("Already signed out.")
+				return nil
+			}
+			cfg.Token = ""
+			cfg.Email = ""
+			// Also drop the active-client pointer: it's account-scoped, so leaving
+			// it would bleed into the next account's session (a later `login` as a
+			// different user would inherit a stale active client in `auth status` /
+			// `client list`).
+			cfg.ActiveClientID = ""
+			if err := cfg.Save(); err != nil {
+				return &exitError{code: 1, err: err}
+			}
+			printerFor(cmd).Successf("Signed out.")
+			return nil
+		},
+	}
+}
+
+// newAuthCmd is the `tracebloc auth` parent; today it carries `auth status`.
+func newAuthCmd() *cobra.Command {
+	cmd := &cobra.Command{
+		Use:   "auth",
+		Short: "Inspect tracebloc authentication state",
+	}
+	cmd.AddCommand(newAuthStatusCmd())
+	return cmd
+}
+
+// newAuthStatusCmd implements `tracebloc auth status`.
+func newAuthStatusCmd() *cobra.Command {
+	return &cobra.Command{
+		Use:   "status",
+		Short: "Show whether you're signed in, and to which backend",
+		Args:  cobra.NoArgs,
+		RunE: func(cmd *cobra.Command, _ []string) error {
+			cfg, err := config.Load()
+			if err != nil {
+				return &exitError{code: 1, err: err}
+			}
+			p := printerFor(cmd)
+			if !cfg.SignedIn() {
+				p.Hintf("Not signed in. Run `tracebloc login`.")
+				return nil
+			}
+			env := cfg.Env
+			if env == "" {
+				env = api.EnvProd
+			}
+			p.Section("tracebloc auth")
+			p.Field("status", "signed in")
+			p.Field("backend", env)
+			if cfg.Email != "" {
+				p.Field("account", cfg.Email)
+			}
+			if cfg.ActiveClientID != "" {
+				p.Field("active client", cfg.ActiveClientID)
+			}
+			return nil
+		},
+	}
+}
diff --git a/internal/cli/auth_test.go b/internal/cli/auth_test.go
new file mode 100644
index 0000000..d17ce6e
--- /dev/null
+++ b/internal/cli/auth_test.go
@@ -0,0 +1,217 @@
+package cli
+
+import (
+	"bytes"
+	"net/http"
+	"net/http/httptest"
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/tracebloc/cli/internal/api"
+	"github.com/tracebloc/cli/internal/config"
+)
+
+// withTestBackend points the login command at an httptest server (via the
+// newAPIClient seam), makes polling instant (pollAfter seam), and isolates the
+// on-disk config to a temp dir. All are restored on cleanup.
+func withTestBackend(t *testing.T, h http.HandlerFunc) {
+	t.Helper()
+	srv := httptest.NewServer(h)
+	t.Cleanup(srv.Close)
+	t.Setenv("TRACEBLOC_CONFIG_DIR", t.TempDir())
+
+	origClient, origAfter := newAPIClient, pollAfter
+	newAPIClient = func(string) *api.Client {
+		return &api.Client{BaseURL: srv.URL, HTTP: srv.Client()}
+	}
+	pollAfter = func(time.Duration) <-chan time.Time {
+		ch := make(chan time.Time, 1)
+		ch <- time.Time{}
+		return ch
+	}
+	t.Cleanup(func() { newAPIClient = origClient; pollAfter = origAfter })
+}
+
+func runCmd(t *testing.T, args ...string) (string, error) {
+	t.Helper()
+	root := NewRootCmd(BuildInfo{Version: "test"})
+	var out bytes.Buffer
+	root.SetOut(&out)
+	root.SetErr(&out)
+	root.SetArgs(args)
+	err := root.Execute()
+	return out.String(), err
+}
+
+func TestLogin_FullFlow(t *testing.T) {
+	var polls int
+	withTestBackend(t, func(w http.ResponseWriter, r *http.Request) {
+		switch r.URL.Path {
+		case "/device/code":
+			_, _ = w.Write([]byte(`{"device_code":"dc","user_code":"WDJB-MJHT","verification_uri":"https://x/activate","expires_in":600,"interval":5}`))
+		case "/device/token":
+			polls++
+			if polls == 1 {
+				w.WriteHeader(http.StatusBadRequest)
+				_, _ = w.Write([]byte(`{"error":"authorization_pending"}`))
+				return
+			}
+			_, _ = w.Write([]byte(`{"token":"cat_abc"}`))
+		case "/userinfo/":
+			if got := r.Header.Get("Authorization"); got != "Bearer cat_abc" {
+				t.Errorf("userinfo auth header = %q, want %q", got, "Bearer cat_abc")
+			}
+			_, _ = w.Write([]byte(`{"email":"ds@tracebloc.io","account":"Acme"}`))
+		default:
+			t.Errorf("unexpected request path %s", r.URL.Path)
+		}
+	})
+
+	out, err := runCmd(t, "login")
+	if err != nil {
+		t.Fatalf("login: %v", err)
+	}
+	if polls != 2 {
+		t.Errorf("expected 2 polls (pending then token), got %d", polls)
+	}
+	cfg, _ := config.Load()
+	if cfg.Token != "cat_abc" {
+		t.Errorf("stored token = %q, want cat_abc", cfg.Token)
+	}
+	if cfg.Email != "ds@tracebloc.io" {
+		t.Errorf("stored email = %q, want ds@tracebloc.io", cfg.Email)
+	}
+	if !strings.Contains(out, "ds@tracebloc.io") {
+		t.Errorf("expected output to show the account, got:\n%s", out)
+	}
+}
+
+// TestLogin_SlowDownBacksOffByFive pins RFC 8628 §3.5: on `slow_down` the poll
+// interval must increase by 5 seconds, not 1. Captures the durations handed to
+// the pollAfter seam.
+func TestLogin_SlowDownBacksOffByFive(t *testing.T) {
+	t.Setenv("TRACEBLOC_CONFIG_DIR", t.TempDir())
+	var polls int
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		switch r.URL.Path {
+		case "/device/code":
+			_, _ = w.Write([]byte(`{"device_code":"dc","user_code":"X","verification_uri":"https://x/activate","expires_in":600,"interval":5}`))
+		case "/device/token":
+			polls++
+			if polls == 1 {
+				w.WriteHeader(http.StatusBadRequest)
+				_, _ = w.Write([]byte(`{"error":"slow_down"}`))
+				return
+			}
+			_, _ = w.Write([]byte(`{"token":"cat_ok"}`))
+		case "/userinfo/":
+			_, _ = w.Write([]byte(`{"email":"e@co","account":"A"}`))
+		}
+	}))
+	t.Cleanup(srv.Close)
+
+	origClient, origAfter := newAPIClient, pollAfter
+	newAPIClient = func(string) *api.Client { return &api.Client{BaseURL: srv.URL, HTTP: srv.Client()} }
+	var waits []time.Duration
+	pollAfter = func(d time.Duration) <-chan time.Time {
+		waits = append(waits, d)
+		ch := make(chan time.Time, 1)
+		ch <- time.Time{}
+		return ch
+	}
+	t.Cleanup(func() { newAPIClient = origClient; pollAfter = origAfter })
+
+	if _, err := runCmd(t, "login"); err != nil {
+		t.Fatalf("login: %v", err)
+	}
+	if len(waits) < 2 {
+		t.Fatalf("expected >=2 polls, got waits=%v", waits)
+	}
+	if waits[0] != 5*time.Second {
+		t.Errorf("first poll wait = %v, want 5s (server interval)", waits[0])
+	}
+	if waits[1] != 10*time.Second {
+		t.Errorf("post-slow_down wait = %v, want 10s (interval+5 per RFC 8628), not 6s", waits[1])
+	}
+}
+
+func TestLogin_BackendUnsupported(t *testing.T) {
+	withTestBackend(t, func(w http.ResponseWriter, _ *http.Request) {
+		w.WriteHeader(http.StatusNotFound)
+	})
+	_, err := runCmd(t, "login")
+	if err == nil || !strings.Contains(err.Error(), "doesn't support browser login") {
+		t.Errorf("want unsupported-backend error, got %v", err)
+	}
+	cfg, _ := config.Load()
+	if cfg.SignedIn() {
+		t.Error("must not store a token when the backend has no device endpoints")
+	}
+}
+
+func TestLogin_Denied(t *testing.T) {
+	withTestBackend(t, func(w http.ResponseWriter, r *http.Request) {
+		switch r.URL.Path {
+		case "/device/code":
+			_, _ = w.Write([]byte(`{"device_code":"dc","user_code":"X","interval":5}`))
+		default:
+			w.WriteHeader(http.StatusBadRequest)
+			_, _ = w.Write([]byte(`{"error":"access_denied"}`))
+		}
+	})
+	_, err := runCmd(t, "login")
+	if err == nil || !strings.Contains(err.Error(), "denied") {
+		t.Errorf("want access-denied error, got %v", err)
+	}
+}
+
+func TestLogout(t *testing.T) {
+	t.Setenv("TRACEBLOC_CONFIG_DIR", t.TempDir())
+	if err := (&config.Config{Token: "x", Email: "e@co", ActiveClientID: "7"}).Save(); err != nil {
+		t.Fatal(err)
+	}
+	out, err := runCmd(t, "logout")
+	if err != nil {
+		t.Fatal(err)
+	}
+	cfg, _ := config.Load()
+	if cfg.SignedIn() {
+		t.Error("expected to be signed out")
+	}
+	// The active-client pointer is account-scoped — it must not survive logout,
+	// or it bleeds into the next account's session.
+	if cfg.ActiveClientID != "" {
+		t.Errorf("active_client_id = %q after logout, want cleared", cfg.ActiveClientID)
+	}
+	if !strings.Contains(out, "Signed out") {
+		t.Errorf("got:\n%s", out)
+	}
+}
+
+func TestAuthStatus_SignedIn(t *testing.T) {
+	t.Setenv("TRACEBLOC_CONFIG_DIR", t.TempDir())
+	if err := (&config.Config{Env: "dev", Token: "x", Email: "ds@co"}).Save(); err != nil {
+		t.Fatal(err)
+	}
+	out, err := runCmd(t, "auth", "status")
+	if err != nil {
+		t.Fatal(err)
+	}
+	for _, want := range []string{"signed in", "ds@co", "dev"} {
+		if !strings.Contains(out, want) {
+			t.Errorf("status output missing %q, got:\n%s", want, out)
+		}
+	}
+}
+
+func TestAuthStatus_NotSignedIn(t *testing.T) {
+	t.Setenv("TRACEBLOC_CONFIG_DIR", t.TempDir())
+	out, err := runCmd(t, "auth", "status")
+	if err != nil {
+		t.Fatal(err)
+	}
+	if !strings.Contains(out, "Not signed in") {
+		t.Errorf("got:\n%s", out)
+	}
+}
diff --git a/internal/cli/client.go b/internal/cli/client.go
new file mode 100644
index 0000000..328bbc3
--- /dev/null
+++ b/internal/cli/client.go
@@ -0,0 +1,439 @@
+package cli
+
+import (
+	"context"
+	"crypto/rand"
+	"encoding/hex"
+	"errors"
+	"fmt"
+	"net/http"
+	"os"
+	"path/filepath"
+	"strconv"
+	"strings"
+
+	"github.com/spf13/cobra"
+
+	"github.com/tracebloc/cli/internal/api"
+	"github.com/tracebloc/cli/internal/cluster"
+	"github.com/tracebloc/cli/internal/config"
+	"github.com/tracebloc/cli/internal/slug"
+	"github.com/tracebloc/cli/internal/ui"
+)
+
+// readClusterID reads the cluster's kube-system UID — the RFC-0001 idempotency
+// anchor (§7.2 / backend#883). A package var so tests can stub it without a
+// reachable cluster.
+var readClusterID = cluster.ClusterID
+
+// newClientCmd wires the `tracebloc client` subtree — provisioning + selecting
+// the client (machine) this host enrolls as. Consumes the backend provisioning
+// endpoints (backend#836) with the user token from `tracebloc login`.
+func newClientCmd() *cobra.Command {
+	cmd := &cobra.Command{
+		Use:   "client",
+		Short: "Provision and manage tracebloc clients (machines)",
+		Long: `Provision a tracebloc client for this machine and list/select clients
+in your account.  Requires sign-in first (` + "`tracebloc login`" + `).`,
+	}
+	cmd.AddCommand(newClientCreateCmd(), newClientListCmd(), newClientUseCmd())
+	return cmd
+}
+
+func newClientCreateCmd() *cobra.Command {
+	var name, location, kubeconfigPath, contextOverride, credentialFile string
+	var yes bool
+	cmd := &cobra.Command{
+		Use:   "create",
+		Short: "Provision a new client for this machine (--name, --location)",
+		Args:  cobra.NoArgs,
+		RunE: func(cmd *cobra.Command, _ []string) error {
+			return runClientCreate(cmd.Context(), printerFor(cmd), clientPrompter(),
+				clientCreateOpts{name: name, location: location, kubeconfigPath: kubeconfigPath, contextOverride: contextOverride, credentialFile: credentialFile, yes: yes})
+		},
+	}
+	cmd.Flags().StringVar(&name, "name", "",
+		"human-readable client name (shown on your dashboard + carbon reports)")
+	cmd.Flags().StringVar(&location, "location", "",
+		"location zone for carbon footprint (e.g. DE); prompted if omitted")
+	cmd.Flags().StringVar(&kubeconfigPath, "kubeconfig", "",
+		"path to the kubeconfig for the target cluster (default: $KUBECONFIG, then ~/.kube/config) — read to anchor the client to this cluster")
+	cmd.Flags().StringVar(&contextOverride, "context", "",
+		"kubeconfig context for the target cluster (default: current-context)")
+	cmd.Flags().StringVar(&credentialFile, "credential-file", "",
+		"write the machine credential to this path (mode 0600, sourceable env) instead of printing it — for the installer to feed the chart (never shown on the terminal)")
+	cmd.Flags().BoolVar(&yes, "yes", false, "skip the confirmation prompt")
+	return cmd
+}
+
+// clientCreateOpts bundles the `client create` inputs (flags + resolved prompts).
+type clientCreateOpts struct {
+	name, location, kubeconfigPath, contextOverride, credentialFile string
+	yes                                                             bool
+}
+
+func newClientListCmd() *cobra.Command {
+	return &cobra.Command{
+		Use:     "list",
+		Aliases: []string{"ls"},
+		Short:   "List the clients in your account",
+		Args:    cobra.NoArgs,
+		RunE: func(cmd *cobra.Command, _ []string) error {
+			return runClientList(cmd.Context(), printerFor(cmd))
+		},
+	}
+}
+
+func newClientUseCmd() *cobra.Command {
+	return &cobra.Command{
+		Use:   "use <client-id>",
+		Short: "Enroll this machine as an existing client",
+		Args:  cobra.ExactArgs(1),
+		RunE: func(cmd *cobra.Command, args []string) error {
+			return runClientUse(cmd.Context(), printerFor(cmd), args[0])
+		},
+	}
+}
+
+// clientPrompter returns the interactive prompter on a TTY, else nil (so
+// commands fall back to flags-only and never block on a pipe / in CI).
+func clientPrompter() prompter {
+	if isInteractiveTTY() {
+		return surveyPrompter{}
+	}
+	return nil
+}
+
+// authedClient loads the signed-in config and returns a token-bearing API
+// client, or an error telling the user to log in.
+func authedClient() (*api.Client, *config.Config, error) {
+	cfg, err := config.Load()
+	if err != nil {
+		return nil, nil, err
+	}
+	if !cfg.SignedIn() {
+		return nil, nil, errors.New("not signed in — run `tracebloc login` first")
+	}
+	env := cfg.Env
+	if env == "" {
+		env = api.ResolveEnv("")
+	}
+	client := newAPIClient(env)
+	client.Token = cfg.Token
+	return client, cfg, nil
+}
+
+func runClientCreate(ctx context.Context, p *ui.Printer, pr prompter, opts clientCreateOpts) error {
+	client, cfg, err := authedClient()
+	if err != nil {
+		return &exitError{code: 1, err: err}
+	}
+
+	name, location := opts.name, opts.location
+
+	// Gather inputs first (flags win; prompt only what's missing, and only on a
+	// TTY), then show one review + confirm — matching the dataset-push flow.
+	if name == "" {
+		if pr == nil {
+			return errMissingFlag("--name")
+		}
+		if name, err = pr.Input("Client name", "shown on your dashboard + carbon reports", "", validateNonEmpty); err != nil {
+			return mapClientErr(err)
+		}
+	}
+	if location == "" {
+		if pr == nil {
+			return errMissingFlag("--location")
+		}
+		// Never silent-empty: the prompt requires a non-empty zone. (Cloud /
+		// GeoIP auto-detect of a suggested default is a fast-follow.)
+		if location, err = pr.Input("Location zone (e.g. DE)", "physical zone, for the carbon footprint", "", validateNonEmpty); err != nil {
+			return mapClientErr(err)
+		}
+	}
+
+	// Read the cluster anchor (kube-system UID) so create is get-or-create keyed on
+	// it — re-running on the same cluster adopts the existing client instead of
+	// minting a duplicate (RFC-0001 §7.2 / backend#883). Best-effort + never silent:
+	// if the cluster isn't reachable we provision WITHOUT an anchor (a plain mint)
+	// and say so, rather than blocking.
+	clusterID, cidErr := readClusterID(ctx, cluster.KubeconfigOptions{Path: opts.kubeconfigPath, Context: opts.contextOverride})
+	if cidErr != nil {
+		p.Hintf("Couldn't read the target cluster's identity — provisioning without a cluster anchor, so re-running won't be idempotent. Point --kubeconfig/--context at the reachable cluster to enable that.")
+	}
+
+	// Derive the namespace slug from the name, avoiding collisions with existing
+	// clients (best-effort: if the list call fails we still derive a base slug).
+	var existing []string
+	if clients, lerr := client.ListClients(ctx); lerr == nil {
+		for _, c := range clients {
+			// Skip the client already anchored to this cluster: a re-run adopts it
+			// (the backend keys on cluster_id), so its namespace isn't a collision.
+			// Counting it would bump the derived slug and show a namespace in the
+			// review that doesn't match the one actually adopted.
+			if clusterID != "" && c.ClusterID == clusterID {
+				continue
+			}
+			if c.Namespace != "" {
+				existing = append(existing, c.Namespace)
+			}
+		}
+	}
+	namespace, err := slug.Derive(name, existing, "client-"+randHex(4))
+	if err != nil {
+		return &exitError{code: 1, err: err}
+	}
+
+	if pr != nil && !opts.yes {
+		renderClientReview(p, name, namespace, location, clusterID)
+		ok, cerr := pr.Confirm("Provision this client?", true)
+		if cerr != nil {
+			return mapClientErr(cerr)
+		}
+		if !ok {
+			p.Hintf("Cancelled.")
+			return nil
+		}
+	}
+
+	// The machine credential: the CLI generates the password, the backend stores
+	// it (write-only). Sent on every create but used only when minting — on an
+	// idempotent adopt the backend keeps the existing client's credential (§7.2),
+	// so the generated value is never printed in that case.
+	password := randHex(24)
+	pc, adopted, err := client.CreateClient(ctx, api.CreateClientRequest{
+		Name:      name,
+		Namespace: namespace,
+		Location:  location,
+		Password:  password,
+		ClusterID: clusterID,
+	})
+	if err != nil {
+		var ae *api.APIError
+		if errors.As(err, &ae) {
+			switch ae.StatusCode {
+			case http.StatusForbidden:
+				return askAnAdmin(ctx, p, client)
+			case http.StatusConflict:
+				// Per RFC C.3 the only 409 on POST /edge-device/ is cluster_conflict
+				// (R6): this cluster_id is bound to another account.
+				return &exitError{code: 1, err: errors.New(
+					"this cluster is already registered to a different tracebloc account — " +
+						"sign in to that account, or ask your admin (cluster_conflict)")}
+			}
+		}
+		return &exitError{code: 1, err: err}
+	}
+
+	cfg.ActiveClientID = strconv.Itoa(pc.ID)
+
+	p.Newline()
+	if adopted {
+		// Idempotent re-run: the backend matched this cluster_id to an existing
+		// client and returned it — no new credential. (The existing-fleet R7 case,
+		// where the backend instead matches a live in-cluster TB_CLIENT_ID whose
+		// cluster_id is still null and the CLI backfills it via PATCH, is the
+		// installer's orchestration — #838 — not done here.)
+		p.Successf("This cluster is already registered as client %q (namespace %s) — adopted it.", pc.Name, pc.Namespace)
+		p.Hintf("No new credential issued; the existing one stands. This machine is set to enroll as client %d.", pc.ID)
+		if opts.credentialFile != "" {
+			// No password to hand over on adopt (it's write-only on the backend and
+			// the existing one stands). Emit id + namespace + an ADOPTED marker so the
+			// installer reconciles the existing release rather than expecting a fresh
+			// credential (#838).
+			if werr := writeClientCredential(opts.credentialFile, []string{
+				"TRACEBLOC_CLIENT_ID=" + strconv.Itoa(pc.ID),
+				"TB_NAMESPACE=" + pc.Namespace,
+				"TRACEBLOC_CLIENT_ADOPTED=1",
+			}); werr != nil {
+				return &exitError{code: 1, err: werr}
+			}
+			p.Hintf("Wrote client id + namespace to %s (no new credential — the existing one stands).", opts.credentialFile)
+		}
+		// Mirror the mint path: a config-save failure shouldn't bury the result —
+		// hint how to set the pointer by hand and still exit clean.
+		if serr := cfg.Save(); serr != nil {
+			p.Hintf("Couldn't save the active-client pointer (%v) — run `tracebloc client use %d` to set it.", serr, pc.ID)
+		}
+		return nil
+	}
+	// Mint. With --credential-file the secret goes to a 0600 file (never the
+	// terminal — RFC §9 "secure by invisibility") for the installer to source;
+	// otherwise it's printed (the interim, until the installer drives this).
+	p.Successf("Provisioned client %q (namespace %s).", pc.Name, pc.Namespace)
+	if opts.credentialFile != "" {
+		if werr := writeClientCredential(opts.credentialFile, []string{
+			"TRACEBLOC_CLIENT_ID=" + strconv.Itoa(pc.ID),
+			"TRACEBLOC_CLIENT_PASSWORD=" + password,
+			"TB_NAMESPACE=" + pc.Namespace,
+		}); werr != nil {
+			// The credential is the only copy — a write failure must be fatal, not a
+			// silent drop (the installer would have nothing to connect with).
+			return &exitError{code: 1, err: werr}
+		}
+		p.Hintf("Credential written to %s (mode 0600, not shown). This machine is set to enroll as client %d.", opts.credentialFile, pc.ID)
+	} else {
+		// Print the credential FIRST — it's the only copy (the backend stores only
+		// the hash), so a later config-save failure must never cost it.
+		p.Section("Machine credential — needed by the installer to connect this client")
+		p.Field("client id", strconv.Itoa(pc.ID))
+		p.Field("username", pc.Username)
+		p.Field("password", password)
+	}
+	if serr := cfg.Save(); serr != nil {
+		p.Hintf("Couldn't save the active-client pointer (%v) — run `tracebloc client use %d` to set it.", serr, pc.ID)
+	}
+	return nil
+}
+
+// writeClientCredential writes the machine credential to path (mode 0600) as a
+// shell-sourceable env file — the installer (#838) sources it to feed the chart,
+// so the secret lands in a 0600 file, never the terminal (RFC §9 never-show). The
+// values are constrained charsets (numeric id, hex password, DNS-1123 slug), so
+// no shell-escaping is needed.
+//
+// Written via a 0600 temp file + atomic rename rather than os.WriteFile: WriteFile
+// only applies its perm bits when it *creates* the file, so a pre-existing target
+// (a stale file, or one an attacker pre-creates world-readable) would keep its old
+// mode and leak the secret — the 0600 guarantee must hold unconditionally. The temp
+// also avoids following a symlink at the target and never leaves a half-written
+// credential behind.
+func writeClientCredential(path string, lines []string) error {
+	dir := filepath.Dir(path)
+	if dir != "" && dir != "." {
+		if err := os.MkdirAll(dir, 0o700); err != nil {
+			return fmt.Errorf("creating credential-file directory: %w", err)
+		}
+	}
+	body := "# tracebloc client credential — written by `tracebloc client create`.\n" +
+		"# Mode 0600; sourced by the installer. Do not commit or share.\n" +
+		strings.Join(lines, "\n") + "\n"
+	// CreateTemp makes the file 0600 by construction, in the target dir so the
+	// rename stays on one filesystem.
+	f, err := os.CreateTemp(dir, ".cred-*")
+	if err != nil {
+		return fmt.Errorf("writing credential file %s: %w", path, err)
+	}
+	tmp := f.Name()
+	if _, err := f.WriteString(body); err != nil {
+		_ = f.Close()
+		_ = os.Remove(tmp)
+		return fmt.Errorf("writing credential file %s: %w", path, err)
+	}
+	if err := f.Close(); err != nil {
+		_ = os.Remove(tmp)
+		return fmt.Errorf("writing credential file %s: %w", path, err)
+	}
+	if err := os.Rename(tmp, path); err != nil {
+		_ = os.Remove(tmp)
+		return fmt.Errorf("writing credential file %s: %w", path, err)
+	}
+	return nil
+}
+
+// askAnAdmin renders the "you can't provision — here's who can" path (a 403 from
+// the backend means no CLIENT_WRITE; backend#836 Q4).
+func askAnAdmin(ctx context.Context, p *ui.Printer, client *api.Client) error {
+	p.Newline()
+	p.Hintf("You don't have permission to provision a client in this account.")
+	if admins, err := client.ListClientAdmins(ctx); err == nil && len(admins) > 0 {
+		p.Section("Ask one of these admins to provision it (or grant you access)")
+		for _, a := range admins {
+			label := a.Name
+			if label == "" {
+				label = a.Email
+			}
+			p.Field(label, a.Email)
+		}
+	}
+	return &exitError{code: 1, err: errors.New("provisioning requires CLIENT_WRITE permission")}
+}
+
+func runClientList(ctx context.Context, p *ui.Printer) error {
+	client, cfg, err := authedClient()
+	if err != nil {
+		return &exitError{code: 1, err: err}
+	}
+	clients, err := client.ListClients(ctx)
+	if err != nil {
+		return &exitError{code: 1, err: err}
+	}
+	if len(clients) == 0 {
+		p.Hintf("No clients yet. Run `tracebloc client create`.")
+		return nil
+	}
+	p.Section("Clients in your account")
+	for _, c := range clients {
+		marker := ""
+		if strconv.Itoa(c.ID) == cfg.ActiveClientID {
+			marker = "  (active)"
+		}
+		p.Field(strconv.Itoa(c.ID)+marker,
+			fmt.Sprintf("%s   namespace=%s   location=%s", c.Name, c.Namespace, c.Location))
+	}
+	return nil
+}
+
+func runClientUse(ctx context.Context, p *ui.Printer, id string) error {
+	client, cfg, err := authedClient()
+	if err != nil {
+		return &exitError{code: 1, err: err}
+	}
+	clients, err := client.ListClients(ctx)
+	if err != nil {
+		return &exitError{code: 1, err: err}
+	}
+	for _, c := range clients {
+		if strconv.Itoa(c.ID) == id {
+			cfg.ActiveClientID = id
+			if serr := cfg.Save(); serr != nil {
+				return &exitError{code: 1, err: serr}
+			}
+			p.Successf("This machine is now set to enroll as client %s (%s).", id, c.Name)
+			return nil
+		}
+	}
+	return &exitError{code: 1, err: fmt.Errorf(
+		"no client %s in your account — run `tracebloc client list` to see the ids", id)}
+}
+
+// renderClientReview shows the assembled inputs before the confirm prompt, so
+// the user sees the derived namespace and location before anything is created.
+func renderClientReview(p *ui.Printer, name, namespace, location, clusterID string) {
+	p.Section("Review")
+	p.Field("name", name)
+	p.Field("namespace", namespace)
+	p.Field("location", location)
+	if clusterID != "" {
+		p.Field("cluster", clusterID+"  (anchors this client — re-runs adopt it)")
+	}
+}
+
+// errMissingFlag reports a required flag absent in a non-interactive run (no TTY
+// to prompt — CI, a pipe, or output redirected).
+func errMissingFlag(flag string) error {
+	return &exitError{code: 1, err: fmt.Errorf("%s is required (non-interactive — no TTY to prompt)", flag)}
+}
+
+// validateNonEmpty rejects blank prompt input.
+func validateNonEmpty(s string) error {
+	if strings.TrimSpace(s) == "" {
+		return errors.New("required")
+	}
+	return nil
+}
+
+// mapClientErr turns a cancelled interactive prompt into a clean exit.
+func mapClientErr(err error) error {
+	if errors.Is(err, errInteractiveCancelled) {
+		return nil
+	}
+	return &exitError{code: 1, err: err}
+}
+
+// randHex returns nbytes of crypto-random data hex-encoded.
+func randHex(nbytes int) string {
+	b := make([]byte, nbytes)
+	_, _ = rand.Read(b) // crypto/rand.Read does not fail on a healthy system
+	return hex.EncodeToString(b)
+}
diff --git a/internal/cli/client_test.go b/internal/cli/client_test.go
new file mode 100644
index 0000000..dcdcc1c
--- /dev/null
+++ b/internal/cli/client_test.go
@@ -0,0 +1,551 @@
+package cli
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"net/http"
+	"net/http/httptest"
+	"os"
+	"path/filepath"
+	"strings"
+	"testing"
+
+	"github.com/tracebloc/cli/internal/api"
+	"github.com/tracebloc/cli/internal/cluster"
+	"github.com/tracebloc/cli/internal/config"
+	"github.com/tracebloc/cli/internal/ui"
+)
+
+// withClientBackend points the client commands at an httptest server (via the
+// newAPIClient seam) and writes a signed-in config to a temp dir. It also stubs
+// readClusterID to "no cluster" by default, so create tests never touch a real
+// kubeconfig/cluster — tests that exercise the anchor override it via stubClusterID.
+func withClientBackend(t *testing.T, h http.HandlerFunc) {
+	t.Helper()
+	srv := httptest.NewServer(h)
+	t.Cleanup(srv.Close)
+	t.Setenv("TRACEBLOC_CONFIG_DIR", t.TempDir())
+	if err := (&config.Config{Env: "dev", Token: "tok"}).Save(); err != nil {
+		t.Fatal(err)
+	}
+	orig := newAPIClient
+	newAPIClient = func(string) *api.Client {
+		return &api.Client{BaseURL: srv.URL, HTTP: srv.Client()}
+	}
+	t.Cleanup(func() { newAPIClient = orig })
+
+	origCID := readClusterID
+	readClusterID = func(context.Context, cluster.KubeconfigOptions) (string, error) {
+		return "", errors.New("no cluster reachable (test default)")
+	}
+	t.Cleanup(func() { readClusterID = origCID })
+}
+
+// stubClusterID overrides the cluster-anchor read for a single test.
+func stubClusterID(t *testing.T, uid string, err error) {
+	t.Helper()
+	orig := readClusterID
+	readClusterID = func(context.Context, cluster.KubeconfigOptions) (string, error) {
+		return uid, err
+	}
+	t.Cleanup(func() { readClusterID = orig })
+}
+
+func TestClientCreate_Success(t *testing.T) {
+	var body api.CreateClientRequest
+	withClientBackend(t, func(w http.ResponseWriter, r *http.Request) {
+		switch {
+		case r.Method == http.MethodGet && r.URL.Path == "/edge-device/":
+			_, _ = w.Write([]byte(`[]`)) // no existing clients
+		case r.Method == http.MethodPost && r.URL.Path == "/edge-device/":
+			if got := r.Header.Get("Authorization"); got != "Bearer tok" {
+				t.Errorf("auth header = %q, want Bearer tok", got)
+			}
+			_ = json.NewDecoder(r.Body).Decode(&body)
+			w.WriteHeader(http.StatusCreated)
+			_, _ = w.Write([]byte(`{"id":5,"first_name":"my-client","username":"u-123","namespace":"my-client","location":"DE"}`))
+		default:
+			t.Errorf("unexpected %s %s", r.Method, r.URL.Path)
+		}
+	})
+	var out bytes.Buffer
+	if err := runClientCreate(context.Background(), ui.New(&out), nil, clientCreateOpts{name: "my-client", location: "DE", yes: true}); err != nil {
+		t.Fatalf("create: %v", err)
+	}
+	if body.Namespace != "my-client" || body.Location != "DE" || body.Password == "" {
+		t.Errorf("create body = %+v", body)
+	}
+	cfg, _ := config.Load()
+	if cfg.ActiveClientID != "5" {
+		t.Errorf("active client = %q, want 5", cfg.ActiveClientID)
+	}
+	if !strings.Contains(out.String(), "u-123") {
+		t.Errorf("output missing username:\n%s", out.String())
+	}
+}
+
+func TestClientCreate_AskAnAdmin(t *testing.T) {
+	withClientBackend(t, func(w http.ResponseWriter, r *http.Request) {
+		switch {
+		case r.URL.Path == "/edge-device/" && r.Method == http.MethodGet:
+			_, _ = w.Write([]byte(`[]`))
+		case r.URL.Path == "/edge-device/" && r.Method == http.MethodPost:
+			w.WriteHeader(http.StatusForbidden)
+			_, _ = w.Write([]byte(`{"detail":"no permission"}`))
+		case r.URL.Path == "/edge-device/admins/":
+			_, _ = w.Write([]byte(`[{"name":"Ada","email":"ada@co.io"}]`))
+		}
+	})
+	var out bytes.Buffer
+	err := runClientCreate(context.Background(), ui.New(&out), nil, clientCreateOpts{name: "my-client", location: "DE", yes: true})
+	if err == nil || !strings.Contains(err.Error(), "CLIENT_WRITE") {
+		t.Errorf("want permission error, got %v", err)
+	}
+	if !strings.Contains(out.String(), "ada@co.io") {
+		t.Errorf("expected admins shown, got:\n%s", out.String())
+	}
+}
+
+func TestClientCreate_RequiresLogin(t *testing.T) {
+	t.Setenv("TRACEBLOC_CONFIG_DIR", t.TempDir()) // no config → not signed in
+	err := runClientCreate(context.Background(), ui.New(&bytes.Buffer{}), nil, clientCreateOpts{name: "x", location: "DE", yes: true})
+	if err == nil || !strings.Contains(err.Error(), "login") {
+		t.Errorf("want not-signed-in error, got %v", err)
+	}
+}
+
+func TestClientList(t *testing.T) {
+	withClientBackend(t, func(w http.ResponseWriter, _ *http.Request) {
+		_, _ = w.Write([]byte(`[{"id":1,"first_name":"alpha","namespace":"alpha","location":"DE"},{"id":2,"first_name":"beta","namespace":"beta","location":"US"}]`))
+	})
+	var out bytes.Buffer
+	if err := runClientList(context.Background(), ui.New(&out)); err != nil {
+		t.Fatal(err)
+	}
+	for _, want := range []string{"alpha", "beta"} {
+		if !strings.Contains(out.String(), want) {
+			t.Errorf("list missing %q:\n%s", want, out.String())
+		}
+	}
+}
+
+func TestClientUse(t *testing.T) {
+	withClientBackend(t, func(w http.ResponseWriter, _ *http.Request) {
+		_, _ = w.Write([]byte(`[{"id":7,"first_name":"gamma","namespace":"gamma"}]`))
+	})
+	if err := runClientUse(context.Background(), ui.New(&bytes.Buffer{}), "7"); err != nil {
+		t.Fatal(err)
+	}
+	cfg, _ := config.Load()
+	if cfg.ActiveClientID != "7" {
+		t.Errorf("active = %q, want 7", cfg.ActiveClientID)
+	}
+	if err := runClientUse(context.Background(), ui.New(&bytes.Buffer{}), "99"); err == nil {
+		t.Error("expected an error for an unknown client id")
+	}
+}
+
+func TestClientCreate_Interactive(t *testing.T) {
+	var body api.CreateClientRequest
+	posted := false
+	withClientBackend(t, func(w http.ResponseWriter, r *http.Request) {
+		switch {
+		case r.Method == http.MethodGet && r.URL.Path == "/edge-device/":
+			_, _ = w.Write([]byte(`[]`))
+		case r.Method == http.MethodPost && r.URL.Path == "/edge-device/":
+			posted = true
+			_ = json.NewDecoder(r.Body).Decode(&body)
+			w.WriteHeader(http.StatusCreated)
+			_, _ = w.Write([]byte(`{"id":9,"first_name":"Lab One","username":"u-9","namespace":"lab-one","location":"DE"}`))
+		}
+	})
+	confirmYes := true
+	pr := &fakePrompter{
+		answers: map[string]string{
+			"Client name":             "Lab One",
+			"Location zone (e.g. DE)": "DE",
+		},
+		confirm: &confirmYes,
+	}
+	var out bytes.Buffer
+	if err := runClientCreate(context.Background(), ui.New(&out), pr, clientCreateOpts{}); err != nil {
+		t.Fatalf("interactive create: %v", err)
+	}
+	if !posted {
+		t.Fatal("expected a POST after the user confirmed")
+	}
+	if body.Name != "Lab One" || body.Namespace != "lab-one" || body.Location != "DE" {
+		t.Errorf("create body = %+v", body)
+	}
+	if !strings.Contains(out.String(), "Review") {
+		t.Errorf("expected a review section before the confirm, got:\n%s", out.String())
+	}
+}
+
+func TestClientCreate_InteractiveCancel(t *testing.T) {
+	posted := false
+	withClientBackend(t, func(w http.ResponseWriter, r *http.Request) {
+		if r.Method == http.MethodPost {
+			posted = true
+		}
+		_, _ = w.Write([]byte(`[]`))
+	})
+	confirmNo := false
+	pr := &fakePrompter{
+		answers: map[string]string{
+			"Client name":             "Lab Two",
+			"Location zone (e.g. DE)": "US",
+		},
+		confirm: &confirmNo,
+	}
+	var out bytes.Buffer
+	if err := runClientCreate(context.Background(), ui.New(&out), pr, clientCreateOpts{}); err != nil {
+		t.Fatalf("declining the confirm should be a clean exit, got: %v", err)
+	}
+	if posted {
+		t.Error("no client should be created when the user declines the confirm")
+	}
+	if !strings.Contains(out.String(), "Cancelled") {
+		t.Errorf("expected a Cancelled note, got:\n%s", out.String())
+	}
+}
+
+func TestClientList_Paginated(t *testing.T) {
+	withClientBackend(t, func(w http.ResponseWriter, r *http.Request) {
+		if r.URL.Query().Get("page") == "2" {
+			_, _ = w.Write([]byte(`{"count":2,"next":null,"results":[{"id":2,"first_name":"beta","namespace":"beta"}]}`))
+			return
+		}
+		// page 1: an absolute `next` link, like real DRF pagination
+		_, _ = fmt.Fprintf(w, `{"count":2,"next":"http://%s/edge-device/?page=2","results":[{"id":1,"first_name":"alpha","namespace":"alpha"}]}`, r.Host)
+	})
+	var out bytes.Buffer
+	if err := runClientList(context.Background(), ui.New(&out)); err != nil {
+		t.Fatal(err)
+	}
+	for _, want := range []string{"alpha", "beta"} {
+		if !strings.Contains(out.String(), want) {
+			t.Errorf("paginated list missing %q (next not followed?):\n%s", want, out.String())
+		}
+	}
+}
+
+func TestClientCreate_CollisionSuffix(t *testing.T) {
+	var body api.CreateClientRequest
+	withClientBackend(t, func(w http.ResponseWriter, r *http.Request) {
+		switch {
+		case r.Method == http.MethodGet:
+			// an existing client already holds "my-client"
+			_, _ = w.Write([]byte(`[{"id":1,"first_name":"My Client","namespace":"my-client"}]`))
+		case r.Method == http.MethodPost:
+			_ = json.NewDecoder(r.Body).Decode(&body)
+			w.WriteHeader(http.StatusCreated)
+			_, _ = w.Write([]byte(`{"id":2,"first_name":"My Client","username":"u-2","namespace":"my-client-2","location":"DE"}`))
+		}
+	})
+	if err := runClientCreate(context.Background(), ui.New(&bytes.Buffer{}), nil, clientCreateOpts{name: "My Client", location: "DE", yes: true}); err != nil {
+		t.Fatal(err)
+	}
+	if body.Namespace != "my-client-2" {
+		t.Errorf("namespace = %q, want my-client-2 (collision suffix not applied)", body.Namespace)
+	}
+}
+
+func TestClientCreate_AnchorMint(t *testing.T) {
+	var body api.CreateClientRequest
+	withClientBackend(t, func(w http.ResponseWriter, r *http.Request) {
+		switch {
+		case r.Method == http.MethodGet:
+			_, _ = w.Write([]byte(`[]`))
+		case r.Method == http.MethodPost:
+			_ = json.NewDecoder(r.Body).Decode(&body)
+			w.WriteHeader(http.StatusCreated) // 201 = minted
+			_, _ = w.Write([]byte(`{"id":5,"first_name":"c","username":"u-5","namespace":"c","location":"DE","cluster_id":"uid-1"}`))
+		}
+	})
+	stubClusterID(t, "uid-1", nil)
+	var out bytes.Buffer
+	if err := runClientCreate(context.Background(), ui.New(&out), nil, clientCreateOpts{name: "c", location: "DE", yes: true}); err != nil {
+		t.Fatalf("create: %v", err)
+	}
+	if body.ClusterID != "uid-1" {
+		t.Errorf("cluster_id sent = %q, want uid-1 (anchor not wired into the request)", body.ClusterID)
+	}
+	if !strings.Contains(out.String(), "Machine credential") {
+		t.Errorf("mint should print the credential, got:\n%s", out.String())
+	}
+}
+
+func TestClientCreate_AdoptIdempotent(t *testing.T) {
+	posts := 0
+	var lastBody api.CreateClientRequest
+	withClientBackend(t, func(w http.ResponseWriter, r *http.Request) {
+		switch {
+		case r.Method == http.MethodGet:
+			_, _ = w.Write([]byte(`[]`))
+		case r.Method == http.MethodPost:
+			posts++
+			_ = json.NewDecoder(r.Body).Decode(&lastBody)
+			w.WriteHeader(http.StatusOK) // 200 = adopted an existing client
+			_, _ = w.Write([]byte(`{"id":8,"first_name":"existing","username":"u-8","namespace":"existing","location":"DE","cluster_id":"uid-1"}`))
+		}
+	})
+	stubClusterID(t, "uid-1", nil)
+
+	run := func() string {
+		var out bytes.Buffer
+		if err := runClientCreate(context.Background(), ui.New(&out), nil, clientCreateOpts{name: "c", location: "DE", yes: true}); err != nil {
+			t.Fatalf("adopt: %v", err)
+		}
+		return out.String()
+	}
+	// Two runs against the same cluster — real re-run idempotency: each POSTs once,
+	// both adopt the SAME existing client, neither prints a credential.
+	first, second := run(), run()
+	if posts != 2 {
+		t.Fatalf("posts = %d, want 2 (each run POSTs once)", posts)
+	}
+	for i, out := range []string{first, second} {
+		if !strings.Contains(out, "adopted") {
+			t.Errorf("run %d: adopt should say so, got:\n%s", i, out)
+		}
+		if strings.Contains(out, "Machine credential") {
+			t.Errorf("run %d: adopt must NOT print a credential, got:\n%s", i, out)
+		}
+	}
+	// The credential is still SENT on every create (the backend uses it only on a
+	// mint, §7.2) even though it's never printed on adopt.
+	if lastBody.Password == "" {
+		t.Error("password should still be sent in the adopt POST body")
+	}
+	cfg, _ := config.Load()
+	if cfg.ActiveClientID != "8" {
+		t.Errorf("active client = %q, want 8 (adopted id)", cfg.ActiveClientID)
+	}
+}
+
+func TestClientCreate_CredentialFileMint(t *testing.T) {
+	withClientBackend(t, func(w http.ResponseWriter, r *http.Request) {
+		switch {
+		case r.Method == http.MethodGet:
+			_, _ = w.Write([]byte(`[]`))
+		case r.Method == http.MethodPost:
+			w.WriteHeader(http.StatusCreated)
+			_, _ = w.Write([]byte(`{"id":5,"first_name":"c","username":"u-5","namespace":"my-ns","location":"DE","cluster_id":"uid-1"}`))
+		}
+	})
+	stubClusterID(t, "uid-1", nil)
+	credPath := filepath.Join(t.TempDir(), "cred.env")
+	var out bytes.Buffer
+	if err := runClientCreate(context.Background(), ui.New(&out), nil,
+		clientCreateOpts{name: "c", location: "DE", yes: true, credentialFile: credPath}); err != nil {
+		t.Fatalf("create: %v", err)
+	}
+	// never-show: the secret must NOT hit the terminal.
+	if strings.Contains(out.String(), "Machine credential") || strings.Contains(out.String(), "password") {
+		t.Errorf("credential must not be printed when --credential-file is set, got:\n%s", out.String())
+	}
+	// the file is 0600 and carries the sourceable credential.
+	info, err := os.Stat(credPath)
+	if err != nil {
+		t.Fatalf("credential file not written: %v", err)
+	}
+	if perm := info.Mode().Perm(); perm != 0o600 {
+		t.Errorf("credential file mode = %o, want 600", perm)
+	}
+	kv := parseEnvFile(t, credPath)
+	if kv["TRACEBLOC_CLIENT_ID"] != "5" || kv["TB_NAMESPACE"] != "my-ns" || kv["TRACEBLOC_CLIENT_PASSWORD"] == "" {
+		t.Errorf("credential file = %v (want id=5, ns=my-ns, non-empty password)", kv)
+	}
+	// never-show, the real invariant: the minted password VALUE must not appear
+	// in stdout under any label (the string checks above are just a proxy).
+	if strings.Contains(out.String(), kv["TRACEBLOC_CLIENT_PASSWORD"]) {
+		t.Errorf("minted password leaked to the terminal:\n%s", out.String())
+	}
+}
+
+// TestClientCreate_CredentialFilePreexistingPerms locks in the 0600 guarantee
+// when the target already exists with looser perms — os.WriteFile would have
+// kept the stale mode and leaked the secret group/other-readable.
+func TestClientCreate_CredentialFilePreexistingPerms(t *testing.T) {
+	withClientBackend(t, func(w http.ResponseWriter, r *http.Request) {
+		switch {
+		case r.Method == http.MethodGet:
+			_, _ = w.Write([]byte(`[]`))
+		case r.Method == http.MethodPost:
+			w.WriteHeader(http.StatusCreated)
+			_, _ = w.Write([]byte(`{"id":5,"first_name":"c","username":"u-5","namespace":"my-ns","location":"DE","cluster_id":"uid-1"}`))
+		}
+	})
+	stubClusterID(t, "uid-1", nil)
+	credPath := filepath.Join(t.TempDir(), "cred.env")
+	// A stale, world-readable file already sits at the target path.
+	if err := os.WriteFile(credPath, []byte("stale\n"), 0o644); err != nil {
+		t.Fatal(err)
+	}
+	var out bytes.Buffer
+	if err := runClientCreate(context.Background(), ui.New(&out), nil,
+		clientCreateOpts{name: "c", location: "DE", yes: true, credentialFile: credPath}); err != nil {
+		t.Fatalf("create: %v", err)
+	}
+	info, err := os.Stat(credPath)
+	if err != nil {
+		t.Fatalf("credential file not written: %v", err)
+	}
+	if perm := info.Mode().Perm(); perm != 0o600 {
+		t.Errorf("credential file mode = %o over a pre-existing 0644 target, want 600", perm)
+	}
+}
+
+// TestClientCreate_CredentialFileWriteFailFatal asserts a credential-file write
+// failure is fatal — the minted password is the only copy, so a failed write must
+// surface an error, never a silent drop. The target's parent is a regular file, so
+// the directory create (hence the write) fails deterministically.
+func TestClientCreate_CredentialFileWriteFailFatal(t *testing.T) {
+	withClientBackend(t, func(w http.ResponseWriter, r *http.Request) {
+		switch {
+		case r.Method == http.MethodGet:
+			_, _ = w.Write([]byte(`[]`))
+		case r.Method == http.MethodPost:
+			w.WriteHeader(http.StatusCreated)
+			_, _ = w.Write([]byte(`{"id":5,"first_name":"c","username":"u-5","namespace":"my-ns","location":"DE","cluster_id":"uid-1"}`))
+		}
+	})
+	stubClusterID(t, "uid-1", nil)
+	notADir := filepath.Join(t.TempDir(), "iam-a-file")
+	if err := os.WriteFile(notADir, []byte("x"), 0o600); err != nil {
+		t.Fatal(err)
+	}
+	credPath := filepath.Join(notADir, "cred.env") // parent is a file → write fails
+	var out bytes.Buffer
+	err := runClientCreate(context.Background(), ui.New(&out), nil,
+		clientCreateOpts{name: "c", location: "DE", yes: true, credentialFile: credPath})
+	if err == nil {
+		t.Fatal("expected a fatal error when the credential file can't be written, got nil")
+	}
+}
+
+func TestClientCreate_CredentialFileAdopt(t *testing.T) {
+	withClientBackend(t, func(w http.ResponseWriter, r *http.Request) {
+		switch {
+		case r.Method == http.MethodGet:
+			_, _ = w.Write([]byte(`[]`))
+		case r.Method == http.MethodPost:
+			w.WriteHeader(http.StatusOK) // adopt
+			_, _ = w.Write([]byte(`{"id":8,"first_name":"existing","username":"u-8","namespace":"ex-ns","location":"DE","cluster_id":"uid-1"}`))
+		}
+	})
+	stubClusterID(t, "uid-1", nil)
+	credPath := filepath.Join(t.TempDir(), "cred.env")
+	var out bytes.Buffer
+	if err := runClientCreate(context.Background(), ui.New(&out), nil,
+		clientCreateOpts{name: "c", location: "DE", yes: true, credentialFile: credPath}); err != nil {
+		t.Fatalf("adopt: %v", err)
+	}
+	kv := parseEnvFile(t, credPath)
+	// adopt emits id + namespace + the ADOPTED marker, but NO password (the
+	// existing one stands; it's write-only on the backend).
+	if kv["TRACEBLOC_CLIENT_ID"] != "8" || kv["TB_NAMESPACE"] != "ex-ns" || kv["TRACEBLOC_CLIENT_ADOPTED"] != "1" {
+		t.Errorf("adopt credential file = %v (want id=8, ns=ex-ns, ADOPTED=1)", kv)
+	}
+	if _, hasPw := kv["TRACEBLOC_CLIENT_PASSWORD"]; hasPw {
+		t.Errorf("adopt must not write a password (none issued), got:\n%v", kv)
+	}
+}
+
+// parseEnvFile reads a KEY=value env file (skipping # comments) into a map.
+func parseEnvFile(t *testing.T, path string) map[string]string {
+	t.Helper()
+	raw, err := os.ReadFile(path)
+	if err != nil {
+		t.Fatalf("read credential file: %v", err)
+	}
+	kv := map[string]string{}
+	for _, line := range strings.Split(string(raw), "\n") {
+		line = strings.TrimSpace(line)
+		if line == "" || strings.HasPrefix(line, "#") {
+			continue
+		}
+		if k, v, ok := strings.Cut(line, "="); ok {
+			kv[k] = v
+		}
+	}
+	return kv
+}
+
+func TestClientCreate_ClusterConflict(t *testing.T) {
+	withClientBackend(t, func(w http.ResponseWriter, r *http.Request) {
+		switch {
+		case r.Method == http.MethodGet:
+			_, _ = w.Write([]byte(`[]`))
+		case r.Method == http.MethodPost:
+			w.WriteHeader(http.StatusConflict) // 409 = bound to another account (R6)
+			_, _ = w.Write([]byte(`{"error":"cluster_conflict","cluster_id":"uid-1"}`))
+		}
+	})
+	stubClusterID(t, "uid-1", nil)
+	err := runClientCreate(context.Background(), ui.New(&bytes.Buffer{}), nil, clientCreateOpts{name: "c", location: "DE", yes: true})
+	if err == nil || !strings.Contains(err.Error(), "different tracebloc account") {
+		t.Errorf("want a cluster_conflict error, got %v", err)
+	}
+}
+
+func TestClientCreate_NoClusterAnchorWarns(t *testing.T) {
+	var body api.CreateClientRequest
+	withClientBackend(t, func(w http.ResponseWriter, r *http.Request) {
+		switch {
+		case r.Method == http.MethodGet:
+			_, _ = w.Write([]byte(`[]`))
+		case r.Method == http.MethodPost:
+			_ = json.NewDecoder(r.Body).Decode(&body)
+			w.WriteHeader(http.StatusCreated)
+			_, _ = w.Write([]byte(`{"id":3,"first_name":"c","username":"u-3","namespace":"c","location":"DE"}`))
+		}
+	})
+	// readClusterID left at the withClientBackend default (returns an error).
+	var out bytes.Buffer
+	if err := runClientCreate(context.Background(), ui.New(&out), nil, clientCreateOpts{name: "c", location: "DE", yes: true}); err != nil {
+		t.Fatalf("create: %v", err)
+	}
+	if body.ClusterID != "" {
+		t.Errorf("cluster_id sent = %q, want empty (no anchor when cluster unreadable)", body.ClusterID)
+	}
+	if !strings.Contains(out.String(), "without a cluster anchor") {
+		t.Errorf("expected a never-silent hint about the missing anchor, got:\n%s", out.String())
+	}
+	// The no-anchor path must still complete a full mint — the credential is shown.
+	if !strings.Contains(out.String(), "Machine credential") {
+		t.Errorf("no-anchor fallback should still print the credential, got:\n%s", out.String())
+	}
+}
+
+// TestClientCreate_ReRunReviewShowsAdoptedNamespace pins that on an idempotent
+// re-run, the client already anchored to this cluster is excluded from collision
+// detection — so the review shows the namespace that's actually adopted
+// (lab-one), not a bumped lab-one-2.
+func TestClientCreate_ReRunReviewShowsAdoptedNamespace(t *testing.T) {
+	withClientBackend(t, func(w http.ResponseWriter, r *http.Request) {
+		switch {
+		case r.Method == http.MethodGet && r.URL.Path == "/edge-device/":
+			// An existing client already anchored to THIS cluster (uid-1).
+			_, _ = w.Write([]byte(`[{"id":1,"first_name":"Lab One","username":"u-1","namespace":"lab-one","location":"DE","cluster_id":"uid-1"}]`))
+		case r.Method == http.MethodPost && r.URL.Path == "/edge-device/":
+			w.WriteHeader(http.StatusOK) // adopt
+			_, _ = w.Write([]byte(`{"id":1,"first_name":"Lab One","username":"u-1","namespace":"lab-one","location":"DE","cluster_id":"uid-1"}`))
+		}
+	})
+	stubClusterID(t, "uid-1", nil)
+	confirmYes := true
+	pr := &fakePrompter{answers: map[string]string{}, confirm: &confirmYes}
+	var out bytes.Buffer
+	if err := runClientCreate(context.Background(), ui.New(&out), pr,
+		clientCreateOpts{name: "Lab One", location: "DE"}); err != nil {
+		t.Fatalf("re-run create: %v", err)
+	}
+	if strings.Contains(out.String(), "lab-one-2") {
+		t.Errorf("review showed a bumped namespace — the cluster's own client wasn't excluded from collision detection:\n%s", out.String())
+	}
+}
diff --git a/internal/cli/cluster.go b/internal/cli/cluster.go
index cea1b27..db2cd7a 100644
--- a/internal/cli/cluster.go
+++ b/internal/cli/cluster.go
@@ -13,11 +13,13 @@ import (
 	"github.com/tracebloc/cli/internal/ui"
 )
 
-// newClusterCmd wires the `tracebloc cluster` subtree. Today it has
-// a single verb — `info` — which is the customer's "is the CLI
-// pointing at the right cluster?" pre-flight before running
-// `dataset push`. Future verbs (e.g. `cluster doctor` for
-// diagnostics, `cluster contexts` for switching) hang off this
+// newClusterCmd wires the `tracebloc cluster` subtree:
+//   - `info`   — the customer's "is the CLI pointing at the right
+//     cluster?" pre-flight before running `dataset push`.
+//   - `doctor` — a read-only health sweep of the running release with
+//     ✔/⚠/✖ checks + remedies (epic client-runtime#116, WS3).
+//
+// Future verbs (e.g. `cluster contexts` for switching) hang off this
 // parent in later phases.
 func newClusterCmd() *cobra.Command {
 	cmd := &cobra.Command{
@@ -33,6 +35,7 @@ the wrong cluster).`,
 	}
 
 	cmd.AddCommand(newClusterInfoCmd())
+	cmd.AddCommand(newClusterDoctorCmd())
 	return cmd
 }
 
diff --git a/internal/cli/dataset.go b/internal/cli/dataset.go
index 1b63d92..012fd2c 100644
--- a/internal/cli/dataset.go
+++ b/internal/cli/dataset.go
@@ -222,8 +222,7 @@ Exit codes:
 	cmd.Flags().StringVar(&table, "table", "",
 		"destination table name (MySQL identifier; matches /data/shared/<table>/ on the PVC)")
 	cmd.Flags().StringVar(&category, "category", "image_classification",
-		"task category: image_classification, tabular_classification, tabular_regression, "+
-			"time_series_forecasting, time_to_event_prediction")
+		"task category, one of: "+push.SupportedCategoriesList())
 	cmd.Flags().StringVar(&intent, "intent", "",
 		"intent: train|test")
 	cmd.Flags().StringVar(&labelColumn, "label-column", "",
@@ -422,24 +421,23 @@ contributors train against it without ever seeing the raw files.`))
 	case a.Spec.Category == "":
 		// Left empty by a caller; let the schema produce the canonical
 		// "category is required" error downstream.
-	case push.IsTabular(a.Spec.Category) || push.IsText(a.Spec.Category) ||
-		a.Spec.Category == "image_classification" ||
-		a.Spec.Category == "object_detection" ||
-		a.Spec.Category == "keypoint_detection":
+	case push.IsCLISupported(a.Spec.Category):
 		// supported
-	case push.IsImage(a.Spec.Category):
-		// semantic_segmentation / instance_segmentation
+	case push.IsKnown(a.Spec.Category):
+		// A recognized category dataset push doesn't implement yet — image
+		// (semantic_segmentation / instance_segmentation) or text
+		// (causal_language_modeling). Routed here (not the default branch) so the
+		// user gets the registry's per-category pending-support reason, not a
+		// misleading "unrecognized category". Supported categories were already
+		// caught above, so IsKnown here means known-but-unsupported.
+		spec, _ := push.Lookup(a.Spec.Category)
 		return &exitError{code: 2, err: fmt.Errorf(
-			"category %q isn't supported by the CLI yet. semantic_segmentation is "+
-				"blocked on the ingestor's mask-sidecar support (data-ingestors#136), and "+
-				"instance_segmentation isn't implemented. Supported image categories: "+
-				"image_classification, object_detection, keypoint_detection.", a.Spec.Category)}
+			"category %q isn't supported by the CLI yet (%s). Supported categories: %s.",
+			a.Spec.Category, spec.UnsupportedNote, push.SupportedCategoriesList())}
 	default:
 		return &exitError{code: 2, err: fmt.Errorf(
-			"category %q isn't a recognized task category. Supported: image_classification, "+
-				"object_detection, keypoint_detection, text_classification, "+
-				"masked_language_modeling, tabular_classification, tabular_regression, "+
-				"time_series_forecasting, time_to_event_prediction.", a.Spec.Category)}
+			"category %q isn't a recognized task category. Supported categories: %s.",
+			a.Spec.Category, push.SupportedCategoriesList())}
 	}
 
 	// 3. Walk the local directory FIRST (local "fail fast"), dispatched
diff --git a/internal/cli/dataset_rm.go b/internal/cli/dataset_rm.go
index a7c398f..ecc83f6 100644
--- a/internal/cli/dataset_rm.go
+++ b/internal/cli/dataset_rm.go
@@ -178,18 +178,30 @@ undone — re-pushing the data is the only way back.`)
 		}
 	}
 
-	// 7. Execute the in-cluster teardown.
+	// 7. Execute the in-cluster teardown. File removal runs in a
+	//    short-lived pod that shares the stage pod's identity (uid
+	//    65532 + fsGroup 65532), so it owns and can delete the staging
+	//    files on any volume type — including hostPath, where fsGroup is
+	//    a no-op (tracebloc/client#259).
 	p.Infof("Removing in-cluster artifacts…")
-	res, err := push.Teardown(ctx, cs, resolved.RestConfig, resolved.Namespace, plan)
+	res, err := push.Teardown(ctx, cs, &push.SPDYExecutor{Config: resolved.RestConfig, Client: cs}, resolved.Namespace, plan, push.PodSpecOptions{
+		Namespace:          resolved.Namespace,
+		PVCClaimName:       pvc.ClaimName,
+		PVCMountPath:       pvc.MountPath,
+		Table:              a.Table,
+		ServiceAccountName: release.IngestorSAName,
+		// Image left empty → push.DefaultStagePodImage (alpine; has rm).
+	})
 	if err != nil {
-		// Teardown is two sequential destructive ops; if the table drop
-		// succeeded but file removal didn't, say so — both ops are
-		// idempotent, so re-running completes the cleanup. (Bugbot #49)
+		// Two sequential destructive ops. If the table dropped but file
+		// removal didn't, the drop is idempotent (DROP TABLE IF EXISTS),
+		// so re-running is safe; if it keeps failing, remove the leftover
+		// staging dirs on the node directly.
 		if res.DroppedTable {
 			return &exitError{code: 7, err: fmt.Errorf(
-				"teardown incomplete — the table %s.%s was dropped, but removing its files failed; "+
-					"re-run `tracebloc dataset rm %s` to remove the leftover files: %w",
-				plan.Database, plan.Table, a.Table, err)}
+				"teardown incomplete — the table %s.%s was dropped, but removing its files failed: %w; "+
+					"re-run `tracebloc dataset rm %s`, or delete the leftover staging dirs on the node",
+				plan.Database, plan.Table, err, a.Table)}
 		}
 		return &exitError{code: 7, err: fmt.Errorf("teardown failed: %w", err)}
 	}
diff --git a/internal/cli/dataset_test.go b/internal/cli/dataset_test.go
index 5d5cfcf..b8415b5 100644
--- a/internal/cli/dataset_test.go
+++ b/internal/cli/dataset_test.go
@@ -93,6 +93,37 @@ func TestDatasetPush_UnsupportedCategory_ExitsTwo(t *testing.T) {
 	}
 }
 
+// TestDatasetPush_KnownUnsupportedCategory_PendingNote pins the Bugbot fix
+// (v0.4.0 RC): a registry-known but CLI-unsupported NON-image category
+// (causal_language_modeling) must get the registry's pending-support note, not
+// the misleading "isn't a recognized task category" message. execDatasetPush
+// discards the error and SilenceErrors swallows it, so run the command here and
+// inspect the returned error directly.
+func TestDatasetPush_KnownUnsupportedCategory_PendingNote(t *testing.T) {
+	root := imgcLayout(t)
+	rootCmd := NewRootCmd(BuildInfo{Version: "test"})
+	rootCmd.SetOut(&bytes.Buffer{})
+	rootCmd.SetErr(&bytes.Buffer{})
+	rootCmd.SetArgs([]string{"dataset", "push",
+		"--kubeconfig=/tmp/tracebloc-cli-test-nonexistent-" + t.Name(),
+		root, "--table=t1", "--category=causal_language_modeling",
+		"--intent=train", "--label-column=label"})
+	err := rootCmd.Execute()
+	if err == nil {
+		t.Fatal("expected an error for a known-but-unsupported category")
+	}
+	if got := ExitCodeFromError(err); got != 2 {
+		t.Fatalf("exit code = %d, want 2", got)
+	}
+	msg := err.Error()
+	if strings.Contains(msg, "isn't a recognized task category") {
+		t.Errorf("known category misrouted to the unrecognized-category branch:\n%s", msg)
+	}
+	if !strings.Contains(msg, "isn't supported by the CLI yet") {
+		t.Errorf("want the registry pending-support note, got:\n%s", msg)
+	}
+}
+
 // TestDatasetPush_TraversalTableName_ExitsTwo is the security
 // regression pin at the CLI layer. --table=../../etc must be
 // rejected with exit 2 BEFORE any spec synthesis or cluster work —
diff --git a/internal/cli/doctor.go b/internal/cli/doctor.go
new file mode 100644
index 0000000..71a484f
--- /dev/null
+++ b/internal/cli/doctor.go
@@ -0,0 +1,133 @@
+package cli
+
+import (
+	"context"
+	"fmt"
+
+	"github.com/spf13/cobra"
+
+	"github.com/tracebloc/cli/internal/cluster"
+	"github.com/tracebloc/cli/internal/doctor"
+	"github.com/tracebloc/cli/internal/ui"
+)
+
+// newClusterDoctorCmd implements `tracebloc cluster doctor` — the sibling of
+// `cluster info` that cluster.go's doc comment anticipated. Where `info`
+// answers "is the CLI pointing at the right cluster?", `doctor` answers "is
+// this running cluster healthy enough to run an experiment, and if not, what
+// do I fix?" — a read-only, post-install health sweep with remedies
+// (epic client-runtime#116, WS3).
+//
+// The three kubeconfig flags match `cluster info` exactly so muscle memory
+// carries over; all are zero-value-safe.
+func newClusterDoctorCmd() *cobra.Command {
+	var (
+		kubeconfigPath  string
+		contextOverride string
+		nsOverride      string
+	)
+
+	cmd := &cobra.Command{
+		Use:   "doctor",
+		Short: "Diagnose a running tracebloc client cluster (✔/⚠/✖ health checks + remedies)",
+		Long: `Runs a read-only health sweep over the tracebloc client release in the
+configured cluster + namespace and prints a ✔/⚠/✖ line per check with a
+remedy for anything that isn't green:
+
+  • Cluster reachable — the API answers and the client chart is installed
+  • Pod health — nothing crash-looping or stuck Pending
+  • Dataset volume — the shared PVC exists and is Bound
+  • Proxy configuration — the in-cluster requests/egress proxy wiring
+  • Backend egress — the tracebloc backend is reachable (from this machine)
+  • Service Bus egress — the requests-proxy that brokers experiment egress is Ready
+
+For a full redacted support bundle to send to tracebloc, use the installer's
+` + "`./install-k8s.sh --diagnose`" + ` instead.
+
+Exit codes:
+  0   all checks passed (or warnings only)
+  2   one or more checks failed
+  3   kubeconfig could not be loaded`,
+		Args: cobra.NoArgs,
+		RunE: func(cmd *cobra.Command, _ []string) error {
+			return runClusterDoctor(
+				cmd.Context(),
+				printerFor(cmd),
+				kubeconfigPath, contextOverride, nsOverride,
+			)
+		},
+	}
+
+	cmd.Flags().StringVar(&kubeconfigPath, "kubeconfig", "",
+		"path to the kubeconfig file (default: $KUBECONFIG, then ~/.kube/config)")
+	cmd.Flags().StringVar(&contextOverride, "context", "",
+		"name of the kubeconfig context to use (default: kubeconfig's current-context)")
+	cmd.Flags().StringVarP(&nsOverride, "namespace", "n", "",
+		"namespace where the parent tracebloc/client release is installed (default: the context's namespace, or 'default')")
+
+	return cmd
+}
+
+func runClusterDoctor(
+	ctx context.Context,
+	p *ui.Printer,
+	kubeconfigPath, contextOverride, nsOverride string,
+) error {
+	p.Banner("tracebloc", "cluster doctor")
+
+	resolved, err := cluster.Load(cluster.KubeconfigOptions{
+		Path:      kubeconfigPath,
+		Context:   contextOverride,
+		Namespace: nsOverride,
+	})
+	if err != nil {
+		// 3 = kubeconfig file/parse problem (same class as cluster info).
+		return &exitError{code: 3, err: fmt.Errorf("loading kubeconfig: %w", err)}
+	}
+
+	cs, err := cluster.NewClientset(resolved)
+	if err != nil {
+		return &exitError{code: 3, err: err}
+	}
+
+	p.Section("Kubeconfig")
+	p.Field("context", resolved.Context)
+	p.Field("server", resolved.ServerURL)
+	p.Field("namespace", resolved.Namespace)
+
+	results := doctor.Run(ctx, cs, doctor.Options{Namespace: resolved.Namespace})
+
+	p.Section("Checks")
+	for _, r := range results {
+		switch r.Status {
+		case doctor.StatusOK:
+			p.Successf("%s — %s", r.Name, r.Detail)
+		case doctor.StatusWarn:
+			p.Warnf("%s — %s", r.Name, r.Detail)
+			if r.Remedy != "" {
+				p.Hintf("     %s", r.Remedy)
+			}
+		case doctor.StatusFail:
+			p.Errorf("%s — %s", r.Name, r.Detail)
+			if r.Remedy != "" {
+				p.Hintf("     %s", r.Remedy)
+			}
+		}
+	}
+
+	p.Newline()
+	switch doctor.Worst(results) {
+	case doctor.StatusFail:
+		p.Errorf("Problems found — fix the ✖ items above.")
+		p.Hintf("For deeper triage, send tracebloc a support bundle: ./install-k8s.sh --diagnose")
+		// Silent (err == nil): the per-check lines above already explained it,
+		// so main() shouldn't print a redundant "Error:" line.
+		return &exitError{code: 2, err: nil}
+	case doctor.StatusWarn:
+		p.Warnf("Completed with warnings — review the ⚠ items above.")
+		return nil
+	default:
+		p.Successf("All checks passed — the cluster looks healthy.")
+		return nil
+	}
+}
diff --git a/internal/cli/interactive.go b/internal/cli/interactive.go
index 3c567bf..bb6478d 100644
--- a/internal/cli/interactive.go
+++ b/internal/cli/interactive.go
@@ -16,20 +16,12 @@ import (
 )
 
 // promptCategories is the ordered list offered by the interactive
-// category picker — the categories `dataset push` supports today (the
-// same set runDatasetPush's category gate accepts). semantic_ /
-// instance_segmentation are omitted until they're implemented.
-var promptCategories = []string{
-	"image_classification",
-	"object_detection",
-	"keypoint_detection",
-	"text_classification",
-	"masked_language_modeling",
-	"tabular_classification",
-	"tabular_regression",
-	"time_series_forecasting",
-	"time_to_event_prediction",
-}
+// category picker. It derives from the push registry's CLI-supported
+// set — the exact categories runDatasetPush's gate accepts — so the
+// picker can't drift from what `dataset push` actually supports.
+// semantic_/instance_segmentation are excluded (CLISupported=false)
+// until they're implemented.
+var promptCategories = push.SupportedCategoryIDs()
 
 // prompter is the narrow seam over the interactive library. Production
 // uses surveyPrompter (a real terminal); tests inject a fake that
diff --git a/internal/cli/root.go b/internal/cli/root.go
index f1047d6..0bf6f8c 100644
--- a/internal/cli/root.go
+++ b/internal/cli/root.go
@@ -81,6 +81,11 @@ what's planned next.`,
 	root.AddCommand(newIngestCmd())
 	root.AddCommand(newClusterCmd())
 	root.AddCommand(newDatasetCmd())
+	// RFC-0001 (backend#830): browser sign-in + client provisioning.
+	root.AddCommand(newLoginCmd())
+	root.AddCommand(newLogoutCmd())
+	root.AddCommand(newAuthCmd())
+	root.AddCommand(newClientCmd())
 
 	// Bare `tracebloc` (no subcommand) renders a friendly home screen
 	// instead of cobra's raw usage dump. Subcommands and --help are
diff --git a/internal/cluster/identity.go b/internal/cluster/identity.go
new file mode 100644
index 0000000..f33f14c
--- /dev/null
+++ b/internal/cluster/identity.go
@@ -0,0 +1,50 @@
+package cluster
+
+import (
+	"context"
+	"fmt"
+	"time"
+
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/client-go/kubernetes"
+)
+
+// clusterIDReadTimeout bounds the best-effort anchor read in ClusterID: a
+// kubeconfig pointing at an unreachable API server would otherwise hang the
+// kube-system GET for the OS TCP timeout, stalling a `client create` that is
+// meant to degrade to a non-anchored mint instead.
+const clusterIDReadTimeout = 8 * time.Second
+
+// ClusterID reads the kube-system namespace UID — the stable per-cluster
+// fingerprint RFC-0001 keys client idempotency on (§6.3 / §7.2; backend#883).
+// It needs a reachable cluster and RBAC to GET namespaces/kube-system. Callers
+// treat a failure as "couldn't read the cluster identity" and fall back to a
+// non-anchored (dual-mode) provision — they must not block on it.
+func ClusterID(ctx context.Context, opts KubeconfigOptions) (string, error) {
+	rc, err := Load(opts)
+	if err != nil {
+		return "", err
+	}
+	// Best-effort read — callers must not block on it (see the doc above), so cap
+	// it; otherwise an unreachable API server hangs the GET below.
+	rc.RestConfig.Timeout = clusterIDReadTimeout
+	cs, err := NewClientset(rc)
+	if err != nil {
+		return "", err
+	}
+	return clusterIDFrom(ctx, cs)
+}
+
+// clusterIDFrom reads the kube-system UID from a clientset. Split out so it can be
+// exercised with a fake clientset without a real cluster.
+func clusterIDFrom(ctx context.Context, cs kubernetes.Interface) (string, error) {
+	ns, err := cs.CoreV1().Namespaces().Get(ctx, "kube-system", metav1.GetOptions{})
+	if err != nil {
+		return "", fmt.Errorf("reading kube-system namespace UID: %w", err)
+	}
+	uid := string(ns.UID)
+	if uid == "" {
+		return "", fmt.Errorf("kube-system namespace has no UID")
+	}
+	return uid, nil
+}
diff --git a/internal/cluster/identity_test.go b/internal/cluster/identity_test.go
new file mode 100644
index 0000000..63c6d46
--- /dev/null
+++ b/internal/cluster/identity_test.go
@@ -0,0 +1,30 @@
+package cluster
+
+import (
+	"context"
+	"testing"
+
+	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/client-go/kubernetes/fake"
+)
+
+func TestClusterIDFrom(t *testing.T) {
+	cs := fake.NewClientset(&corev1.Namespace{
+		ObjectMeta: metav1.ObjectMeta{Name: "kube-system", UID: "uid-abc-123"},
+	})
+	got, err := clusterIDFrom(context.Background(), cs)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if got != "uid-abc-123" {
+		t.Errorf("cluster id = %q, want uid-abc-123", got)
+	}
+}
+
+func TestClusterIDFrom_NoNamespace(t *testing.T) {
+	cs := fake.NewClientset() // empty cluster — no kube-system
+	if _, err := clusterIDFrom(context.Background(), cs); err == nil {
+		t.Error("expected an error when kube-system is absent")
+	}
+}
diff --git a/internal/config/config.go b/internal/config/config.go
new file mode 100644
index 0000000..5145fad
--- /dev/null
+++ b/internal/config/config.go
@@ -0,0 +1,122 @@
+// Package config persists the tracebloc CLI's user state — the backend
+// environment, the user token from `tracebloc login`, and the active client
+// for this machine — at ~/.tracebloc/config.json (mode 0600, it holds a
+// token). RFC-0001 (backend#830).
+package config
+
+import (
+	"encoding/json"
+	"errors"
+	"fmt"
+	"io/fs"
+	"os"
+	"path/filepath"
+)
+
+// Config is the on-disk CLI state. Every field is omitempty so a
+// partially-configured file stays small and forward-compatible.
+type Config struct {
+	Env            string `json:"env,omitempty"`              // dev|stg|prod (mirrors CLIENT_ENV)
+	Email          string `json:"email,omitempty"`            // who is signed in (display only)
+	Token          string `json:"token,omitempty"`            // user token from device login
+	ActiveClientID string `json:"active_client_id,omitempty"` // client this machine enrolls as
+}
+
+// Dir is the config directory: $TRACEBLOC_CONFIG_DIR if set (tests / ops
+// override), else ~/.tracebloc.
+func Dir() (string, error) {
+	if d := os.Getenv("TRACEBLOC_CONFIG_DIR"); d != "" {
+		return d, nil
+	}
+	home, err := os.UserHomeDir()
+	if err != nil {
+		return "", fmt.Errorf("locating home directory: %w", err)
+	}
+	return filepath.Join(home, ".tracebloc"), nil
+}
+
+// Path is the config file path (Dir()/config.json).
+func Path() (string, error) {
+	dir, err := Dir()
+	if err != nil {
+		return "", err
+	}
+	return filepath.Join(dir, "config.json"), nil
+}
+
+// Load reads the config. A missing file is NOT an error — it returns an empty
+// Config (a fresh machine that has never run `login`).
+func Load() (*Config, error) {
+	path, err := Path()
+	if err != nil {
+		return nil, err
+	}
+	data, err := os.ReadFile(path)
+	if errors.Is(err, fs.ErrNotExist) {
+		return &Config{}, nil
+	}
+	if err != nil {
+		return nil, fmt.Errorf("reading %s: %w", path, err)
+	}
+	var c Config
+	if err := json.Unmarshal(data, &c); err != nil {
+		return nil, fmt.Errorf("parsing %s: %w", path, err)
+	}
+	return &c, nil
+}
+
+// Save writes the config 0600 (creating the dir 0700), atomically: a temp file
+// in the same dir then a rename, so a crash mid-write can't truncate the token.
+func (c *Config) Save() error {
+	dir, err := Dir()
+	if err != nil {
+		return err
+	}
+	if err := os.MkdirAll(dir, 0o700); err != nil {
+		return fmt.Errorf("creating %s: %w", dir, err)
+	}
+	data, err := json.MarshalIndent(c, "", "  ")
+	if err != nil {
+		return fmt.Errorf("encoding config: %w", err)
+	}
+	data = append(data, '\n')
+
+	tmp, err := os.CreateTemp(dir, ".config-*.json")
+	if err != nil {
+		return fmt.Errorf("creating temp config: %w", err)
+	}
+	tmpName := tmp.Name()
+	defer func() { _ = os.Remove(tmpName) }() // no-op after a successful rename
+	if err := tmp.Chmod(0o600); err != nil {
+		_ = tmp.Close()
+		return fmt.Errorf("chmod temp config: %w", err)
+	}
+	if _, err := tmp.Write(data); err != nil {
+		_ = tmp.Close()
+		return fmt.Errorf("writing temp config: %w", err)
+	}
+	if err := tmp.Close(); err != nil {
+		return fmt.Errorf("closing temp config: %w", err)
+	}
+	path, _ := Path()
+	if err := os.Rename(tmpName, path); err != nil {
+		return fmt.Errorf("replacing %s: %w", path, err)
+	}
+	return nil
+}
+
+// Clear removes the config file (full sign-out + reset). A missing file is not
+// an error.
+func Clear() error {
+	path, err := Path()
+	if err != nil {
+		return err
+	}
+	if err := os.Remove(path); err != nil && !errors.Is(err, fs.ErrNotExist) {
+		return fmt.Errorf("removing %s: %w", path, err)
+	}
+	return nil
+}
+
+// SignedIn reports whether a user token is stored.
+func (c *Config) SignedIn() bool { return c.Token != "" }
diff --git a/internal/config/config_test.go b/internal/config/config_test.go
new file mode 100644
index 0000000..53dd368
--- /dev/null
+++ b/internal/config/config_test.go
@@ -0,0 +1,64 @@
+package config
+
+import (
+	"os"
+	"testing"
+)
+
+func TestSaveLoadRoundTrip(t *testing.T) {
+	t.Setenv("TRACEBLOC_CONFIG_DIR", t.TempDir())
+	in := &Config{Env: "dev", Email: "a@b.com", Token: "tok123", ActiveClientID: "edge_1"}
+	if err := in.Save(); err != nil {
+		t.Fatal(err)
+	}
+	out, err := Load()
+	if err != nil {
+		t.Fatal(err)
+	}
+	if *out != *in {
+		t.Fatalf("round-trip mismatch: %+v != %+v", out, in)
+	}
+	if !out.SignedIn() {
+		t.Error("SignedIn should be true when a token is present")
+	}
+}
+
+func TestSaveIs0600(t *testing.T) {
+	t.Setenv("TRACEBLOC_CONFIG_DIR", t.TempDir())
+	if err := (&Config{Token: "secret"}).Save(); err != nil {
+		t.Fatal(err)
+	}
+	p, _ := Path()
+	fi, err := os.Stat(p)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if fi.Mode().Perm() != 0o600 {
+		t.Errorf("config mode = %v, want 0600 (it holds a token)", fi.Mode().Perm())
+	}
+}
+
+func TestLoadMissingIsEmpty(t *testing.T) {
+	t.Setenv("TRACEBLOC_CONFIG_DIR", t.TempDir())
+	c, err := Load()
+	if err != nil {
+		t.Fatalf("missing config should not error: %v", err)
+	}
+	if c.SignedIn() {
+		t.Errorf("missing config should be empty, got %+v", c)
+	}
+}
+
+func TestClear(t *testing.T) {
+	t.Setenv("TRACEBLOC_CONFIG_DIR", t.TempDir())
+	_ = (&Config{Token: "x"}).Save()
+	if err := Clear(); err != nil {
+		t.Fatal(err)
+	}
+	if c, _ := Load(); c.SignedIn() {
+		t.Error("after Clear, should not be signed in")
+	}
+	if err := Clear(); err != nil {
+		t.Errorf("Clear on a missing file should be nil, got %v", err)
+	}
+}
diff --git a/internal/doctor/doctor.go b/internal/doctor/doctor.go
new file mode 100644
index 0000000..d69ce9c
--- /dev/null
+++ b/internal/doctor/doctor.go
@@ -0,0 +1,602 @@
+// Package doctor implements the checks behind `tracebloc cluster doctor`:
+// a read-only, best-effort health sweep of a running tracebloc client
+// cluster. Each check reports ✔/⚠/✖ plus a one-line remedy, so a customer
+// can answer "why isn't my experiment running?" without tracebloc shelling
+// into their cluster (epic tracebloc/client-runtime#116, WS3).
+//
+// Design mirrors the installer's preflight.sh: every check is independent
+// and returns a Result instead of aborting, so one failure never hides the
+// others. Network probes are injectable (Options) so the package is fully
+// exercisable with client-go's fake clientset — no real cluster or egress.
+package doctor
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"net/http"
+	"sort"
+	"strings"
+	"time"
+
+	appsv1 "k8s.io/api/apps/v1"
+	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/resource"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/client-go/kubernetes"
+
+	"github.com/tracebloc/cli/internal/cluster"
+)
+
+// Status is a single check's severity. Ordered so the numerically-greatest
+// status is the worst — see Worst.
+type Status int
+
+const (
+	StatusOK Status = iota
+	StatusWarn
+	StatusFail
+)
+
+func (s Status) String() string {
+	switch s {
+	case StatusOK:
+		return "ok"
+	case StatusWarn:
+		return "warn"
+	case StatusFail:
+		return "fail"
+	default:
+		return "unknown"
+	}
+}
+
+// Result is one check's outcome. Remedy is shown to the customer only when
+// Status is not OK.
+type Result struct {
+	Name   string
+	Status Status
+	Detail string
+	Remedy string
+}
+
+// Worst returns the most severe status across results (StatusOK for none).
+// The doctor command maps a non-OK worst to a non-zero exit code.
+func Worst(results []Result) Status {
+	worst := StatusOK
+	for _, r := range results {
+		if r.Status > worst {
+			worst = r.Status
+		}
+	}
+	return worst
+}
+
+// Conservative tunables, kept as package consts to avoid false positives on a
+// busy cluster. If a check ever needs runtime tuning, thread it through Options
+// (like HTTPProbe) rather than making these package vars.
+const (
+	pendingGrace     = 5 * time.Minute // a pod Pending longer than this is flagged
+	httpProbeTimeout = 8 * time.Second
+)
+
+// Options configures a diagnosis run. The zero value is usable: Namespace
+// defaults are the caller's concern (it passes the resolved namespace), and
+// HTTPProbe falls back to the real proxy-aware prober.
+type Options struct {
+	Namespace string
+
+	// HTTPProbe reports whether a URL is reachable from where the CLI runs.
+	// nil => httpProbe (proxy-aware, short timeout). Injected in tests.
+	HTTPProbe func(ctx context.Context, url string) error
+}
+
+// Run executes every check in display order and returns their results. It
+// never returns an error: an unreachable cluster or a failing probe is a
+// Result, not a Go error — the command renders all of them and derives the
+// exit code from Worst.
+func Run(ctx context.Context, cs kubernetes.Interface, opts Options) []Result {
+	if opts.HTTPProbe == nil {
+		opts.HTTPProbe = httpProbe
+	}
+	ns := opts.Namespace
+
+	// Discovered once and shared: the parent release gates nothing (every
+	// check still runs and reports), but the later checks reuse it.
+	release, relErr := cluster.DiscoverParentRelease(ctx, cs, ns)
+	jmEnv := jobsManagerEnv(ctx, cs, ns, release)
+
+	return []Result{
+		checkReachable(release, relErr, ns),
+		checkPods(ctx, cs, ns),
+		checkPVC(ctx, cs, ns),
+		checkNodeFit(ctx, cs, jmEnv),
+		checkImagePull(ctx, cs, ns, release),
+		checkProxy(jmEnv),
+		checkBackendEgress(ctx, jmEnv, opts.HTTPProbe),
+		checkRequestsProxy(ctx, cs, ns, release),
+	}
+}
+
+// checkReachable confirms the API answered and the parent client chart is
+// installed here. It's the gate the customer reads first; the rest still run.
+func checkReachable(release *cluster.ParentRelease, err error, ns string) Result {
+	const name = "Cluster reachable"
+	if err != nil {
+		return Result{
+			Name:   name,
+			Status: StatusFail,
+			Detail: err.Error(),
+			Remedy: "Check your kubeconfig/context and that the tracebloc client chart is installed here: kubectl get deploy -n " + ns,
+		}
+	}
+	return Result{
+		Name:   name,
+		Status: StatusOK,
+		Detail: fmt.Sprintf("release %q, chart %s, appVersion %s (namespace %s)",
+			release.ReleaseName, release.ChartVersion, release.AppVersion, ns),
+	}
+}
+
+// checkPods flags crash-looping or long-Pending pods — the local complement
+// to the controller's crash-loop detection (client-runtime#117). Conservative
+// thresholds keep a transient restart or a briefly-Pending job from tripping it.
+func checkPods(ctx context.Context, cs kubernetes.Interface, ns string) Result {
+	const name = "Pod health"
+	pods, err := cs.CoreV1().Pods(ns).List(ctx, metav1.ListOptions{})
+	if err != nil {
+		return Result{
+			Name:   name,
+			Status: StatusWarn,
+			Detail: "could not list pods: " + err.Error(),
+			Remedy: "Ensure your kubeconfig user can list pods in " + ns + ".",
+		}
+	}
+
+	var crashing, pending []string
+	for _, p := range pods.Items {
+		if podCrashLooping(p) {
+			crashing = append(crashing, p.Name)
+			continue
+		}
+		if p.Status.Phase == corev1.PodPending &&
+			time.Since(p.CreationTimestamp.Time) > pendingGrace {
+			pending = append(pending, p.Name)
+		}
+	}
+	sort.Strings(crashing)
+	sort.Strings(pending)
+
+	switch {
+	case len(crashing) > 0:
+		return Result{
+			Name:   name,
+			Status: StatusFail,
+			Detail: fmt.Sprintf("crash-looping: %v", crashing),
+			Remedy: "Inspect the container's own logs: kubectl logs -n " + ns + " <pod> --previous",
+		}
+	case len(pending) > 0:
+		return Result{
+			Name:   name,
+			Status: StatusWarn,
+			Detail: fmt.Sprintf("Pending > %s: %v", pendingGrace, pending),
+			Remedy: "kubectl describe pod -n " + ns + " <pod> — usually unschedulable (resources) or an image pull issue.",
+		}
+	default:
+		return Result{Name: name, Status: StatusOK, Detail: fmt.Sprintf("%d pod(s), none crash-looping or stuck Pending", len(pods.Items))}
+	}
+}
+
+// podCrashLooping reports whether a pod has a container actively stuck in
+// CrashLoopBackOff — the state Kubernetes sets for a container that keeps
+// crashing. Both init AND app containers are checked: an init container in
+// CrashLoopBackOff keeps the pod Pending and blocks startup silently (Bugbot
+// on PR #89).
+//
+// We deliberately do NOT infer crash-looping from RestartCount: a high count
+// is equally produced by a pod that recovered on retry, a job that retried
+// before Succeeding, or a completed init container — all healthy (Bugbot on
+// PR #89; cf. the controller's recovered-container fix, client-runtime#117).
+// The terminal-phase guard is belt-and-suspenders: a Succeeded/Failed pod has
+// no waiting containers anyway.
+func podCrashLooping(p corev1.Pod) bool {
+	if p.Status.Phase == corev1.PodSucceeded || p.Status.Phase == corev1.PodFailed {
+		return false
+	}
+	for _, group := range [][]corev1.ContainerStatus{p.Status.InitContainerStatuses, p.Status.ContainerStatuses} {
+		for _, c := range group {
+			if c.State.Waiting != nil && c.State.Waiting.Reason == "CrashLoopBackOff" {
+				return true
+			}
+		}
+	}
+	return false
+}
+
+// checkPVC reuses cluster.DiscoverSharedPVC — which already verifies the
+// shared-data PVC exists and is Bound, with actionable errors.
+func checkPVC(ctx context.Context, cs kubernetes.Interface, ns string) Result {
+	const name = "Dataset volume (PVC)"
+	pvc, err := cluster.DiscoverSharedPVC(ctx, cs, ns)
+	if err != nil {
+		return Result{
+			Name:   name,
+			Status: StatusFail,
+			Detail: err.Error(),
+			Remedy: "Check the cluster has a usable StorageClass (kubectl get sc) and the PVC is Bound (kubectl get pvc -n " + ns + ").",
+		}
+	}
+	return Result{
+		Name:   name,
+		Status: StatusOK,
+		Detail: fmt.Sprintf("%s Bound, mounted at %s", pvc.ClaimName, pvc.MountPath),
+	}
+}
+
+// checkProxy surfaces the in-cluster proxy wiring read from jobs-manager's
+// env — the corporate-proxy propagation that, when missing, silently strands
+// egress. Informational: present config is ✔; a missing requests-proxy URL is
+// the only genuine anomaly.
+func checkProxy(env map[string]string) Result {
+	const name = "Proxy configuration"
+	rp := env["REQUESTS_PROXY_URL"]
+	if rp == "" {
+		return Result{
+			Name:   name,
+			Status: StatusWarn,
+			// jobsManagerEnv reads only literal env values, so a chart that sets
+			// REQUESTS_PROXY_URL via a configMap/secret ref reads as empty here —
+			// called out in the remedy so a ref-based install isn't mistaken for
+			// missing wiring.
+			Detail: "jobs-manager has no literal REQUESTS_PROXY_URL (chart too old, or it's set via a configMap/secret ref)",
+			Remedy: "Verify the requests-proxy is wired: kubectl set env deploy/<release>-jobs-manager --list | grep PROXY",
+		}
+	}
+	detail := "requests-proxy=" + rp
+	if eg := env["EGRESS_PROXY_URL"]; eg != "" {
+		detail += ", egress-proxy=" + eg
+	}
+	if env["HTTPS_PROXY"] != "" || env["HTTP_PROXY"] != "" {
+		detail += ", corporate HTTP(S)_PROXY set"
+	} else {
+		detail += ", no corporate HTTP(S)_PROXY"
+	}
+	return Result{Name: name, Status: StatusOK, Detail: detail}
+}
+
+// checkBackendEgress probes the tracebloc backend API. Honest scope: this
+// runs from the machine the CLI is on, NOT from inside the cluster — the
+// cluster egresses via its egress-proxy. An in-cluster probe is the WS3
+// follow-up; this still catches a customer network/proxy that can't reach
+// the backend at all.
+func checkBackendEgress(ctx context.Context, env map[string]string, probe func(context.Context, string) error) Result {
+	const name = "Backend egress (from this machine)"
+	host := backendHost(env["CLIENT_ENV"])
+	url := "https://" + host + "/"
+	if err := probe(ctx, url); err != nil {
+		return Result{
+			Name:   name,
+			Status: StatusFail,
+			Detail: fmt.Sprintf("%s unreachable: %v", host, err),
+			Remedy: "Check this machine's network/proxy to " + host + ". The cluster egresses via its egress-proxy, so this is indicative, not definitive.",
+		}
+	}
+	return Result{Name: name, Status: StatusOK, Detail: host + " reachable"}
+}
+
+// backendHost maps CLIENT_ENV to the backend API host, mirroring the edge
+// runtime's own mapping (controller.py). Unset/unknown defaults to prod, the
+// chart's CLIENT_ENV default.
+func backendHost(clientEnv string) string {
+	// Normalize like the API client (api.ResolveEnv/BaseURL lower-case), so a
+	// non-lowercase CLIENT_ENV on the edge box doesn't fall through to prod and
+	// make the doctor probe the wrong backend.
+	switch strings.ToLower(strings.TrimSpace(clientEnv)) {
+	case "dev":
+		return "dev-api.tracebloc.io"
+	case "stg":
+		return "stg-api.tracebloc.io"
+	default:
+		return "api.tracebloc.io"
+	}
+}
+
+// checkRequestsProxy verifies the requests-proxy deployment — the in-cluster
+// broker for experiment egress (the Service Bus "experiments" queue) — is
+// present and Ready. While it's down, experiments egress fails and the
+// experiment silently stays Pending, the exact class this epic targets.
+func checkRequestsProxy(ctx context.Context, cs kubernetes.Interface, ns string, release *cluster.ParentRelease) Result {
+	const name = "Service Bus egress (requests-proxy)"
+	dep := findDeployment(ctx, cs, ns, release, "requests-proxy")
+	if dep == nil {
+		return Result{
+			Name:   name,
+			Status: StatusFail,
+			Detail: "requests-proxy deployment not found",
+			Remedy: "The requests-proxy brokers experiment (Service Bus) egress; without it experiments stay Pending. Reinstall/upgrade the client chart.",
+		}
+	}
+	if dep.Status.ReadyReplicas < 1 {
+		return Result{
+			Name:   name,
+			Status: StatusFail,
+			Detail: fmt.Sprintf("requests-proxy not ready (%d/%d replicas)", dep.Status.ReadyReplicas, dep.Status.Replicas),
+			Remedy: "Experiments egress flows through requests-proxy; while it's down they stay Pending. kubectl describe deploy " + dep.Name + " -n " + ns,
+		}
+	}
+	return Result{Name: name, Status: StatusOK, Detail: "requests-proxy ready (brokers the 'experiments' queue)"}
+}
+
+// checkNodeFit verifies at least one Ready node can satisfy the resource
+// requests the jobs-manager stamps on spawned training jobs (RESOURCE_REQUESTS
+// / GPU_REQUESTS env) — the "Pending forever, no node big enough" class. GPU is
+// soft: when a GPU is requested but no node exposes it, that's a ⚠ (jobs-manager
+// has a GPU→CPU fallback), not a hard failure.
+func checkNodeFit(ctx context.Context, cs kubernetes.Interface, env map[string]string) Result {
+	const name = "Node capacity"
+	cpuReq, memReq, ok := parseCPUMem(env["RESOURCE_REQUESTS"])
+	if !ok {
+		return Result{
+			Name:   name,
+			Status: StatusWarn,
+			Detail: "couldn't read RESOURCE_REQUESTS from jobs-manager — skipping node-fit",
+			Remedy: "kubectl set env deploy/<release>-jobs-manager --list | grep RESOURCE_REQUESTS",
+		}
+	}
+	gpuName, gpuReq, gpuRequested := parseGPU(env["GPU_REQUESTS"])
+
+	nodes, err := cs.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
+	if err != nil {
+		return Result{
+			Name:   name,
+			Status: StatusWarn,
+			Detail: "could not list nodes: " + err.Error(),
+			Remedy: "Ensure your kubeconfig user can list nodes.",
+		}
+	}
+
+	req := fmt.Sprintf("cpu=%s, memory=%s", cpuReq.String(), memReq.String())
+	if gpuRequested {
+		req += fmt.Sprintf(", %s=%s", gpuName, gpuReq.String())
+	}
+
+	// A pod gets ALL its requested resources from ONE node, so evaluate each
+	// node as a whole — never OR cpu/mem and GPU across different nodes, which
+	// would pass even when no single node can run the job (Bugbot on PR #91).
+	var cpuMemFits, fullFits bool
+	for i := range nodes.Items {
+		n := nodes.Items[i]
+		if !nodeReady(n) {
+			continue
+		}
+		alloc := n.Status.Allocatable
+		nodeCPUMem := alloc.Cpu().Cmp(cpuReq) >= 0 && alloc.Memory().Cmp(memReq) >= 0
+		nodeGPU := !gpuRequested
+		if gpuRequested {
+			if q, present := alloc[gpuName]; present && q.Cmp(gpuReq) >= 0 {
+				nodeGPU = true
+			}
+		}
+		if nodeCPUMem {
+			cpuMemFits = true
+		}
+		if nodeCPUMem && nodeGPU {
+			fullFits = true
+		}
+	}
+
+	switch {
+	case !cpuMemFits:
+		return Result{
+			Name:   name,
+			Status: StatusFail,
+			Detail: fmt.Sprintf("no Ready node can fit a training job (needs %s)", req),
+			Remedy: "Add/resize a node to meet the job's requests, or lower RESOURCE_REQUESTS on jobs-manager.",
+		}
+	case gpuRequested && !fullFits:
+		return Result{
+			Name:   name,
+			Status: StatusWarn,
+			Detail: fmt.Sprintf("no single Ready node satisfies cpu+memory AND %s — GPU jobs rely on the CPU fallback (needs %s)", gpuName, req),
+			Remedy: "If GPU training is expected, ensure one node has both the compute and the GPU capacity, with its device plugin.",
+		}
+	default:
+		return Result{Name: name, Status: StatusOK, Detail: fmt.Sprintf("a Ready node can schedule a training job (%s)", req)}
+	}
+}
+
+// checkImagePull verifies that any registry pull secret the jobs-manager
+// references exists and is a well-formed dockerconfigjson — so private-image
+// pulls don't ImagePullBackOff. (Bad-but-well-formed credentials can't be
+// detected without an actual pull; this catches a missing/empty/malformed
+// secret, the common misconfig.)
+func checkImagePull(ctx context.Context, cs kubernetes.Interface, ns string, release *cluster.ParentRelease) Result {
+	const name = "Image pull secret"
+	dep := findDeployment(ctx, cs, ns, release, "jobs-manager")
+	if dep == nil {
+		return Result{
+			Name:   name,
+			Status: StatusWarn,
+			Detail: "couldn't read jobs-manager to resolve image pull secrets — skipping",
+			Remedy: "Check the parent client release is installed in " + ns + ".",
+		}
+	}
+	secrets := dep.Spec.Template.Spec.ImagePullSecrets
+	if len(secrets) == 0 {
+		return Result{Name: name, Status: StatusOK, Detail: "no image pull secret in use (public/digest-pinned images)"}
+	}
+	for _, ref := range secrets {
+		sec, err := cs.CoreV1().Secrets(ns).Get(ctx, ref.Name, metav1.GetOptions{})
+		if err != nil {
+			return Result{
+				Name:   name,
+				Status: StatusFail,
+				Detail: fmt.Sprintf("image pull secret %q not found", ref.Name),
+				Remedy: "Private image pulls will ImagePullBackOff. Reinstall the chart with valid registry credentials.",
+			}
+		}
+		if sec.Type != corev1.SecretTypeDockerConfigJson {
+			return Result{
+				Name:   name,
+				Status: StatusFail,
+				Detail: fmt.Sprintf("secret %q is type %q, not %s", ref.Name, sec.Type, corev1.SecretTypeDockerConfigJson),
+				Remedy: "Recreate it as a docker-registry secret (kubectl create secret docker-registry).",
+			}
+		}
+		if data := sec.Data[corev1.DockerConfigJsonKey]; len(data) == 0 || !json.Valid(data) {
+			return Result{
+				Name:   name,
+				Status: StatusFail,
+				Detail: fmt.Sprintf("secret %q has an empty or malformed %s", ref.Name, corev1.DockerConfigJsonKey),
+				Remedy: "Recreate the registry secret; its .dockerconfigjson isn't valid JSON.",
+			}
+		}
+	}
+	return Result{Name: name, Status: StatusOK, Detail: fmt.Sprintf("%d image pull secret(s) present and well-formed", len(secrets))}
+}
+
+// parseResourceSpec parses jobs-manager's "k1=v1,k2=v2" resource env into a map.
+func parseResourceSpec(spec string) map[string]string {
+	out := map[string]string{}
+	for _, part := range strings.Split(spec, ",") {
+		kv := strings.SplitN(strings.TrimSpace(part), "=", 2)
+		if len(kv) == 2 && strings.TrimSpace(kv[0]) != "" {
+			out[strings.TrimSpace(kv[0])] = strings.TrimSpace(kv[1])
+		}
+	}
+	return out
+}
+
+// parseCPUMem extracts the cpu + memory quantities from a RESOURCE_REQUESTS
+// spec; ok is false unless both are present and parseable.
+func parseCPUMem(spec string) (cpu, mem resource.Quantity, ok bool) {
+	m := parseResourceSpec(spec)
+	c, cOK := m["cpu"]
+	mm, mOK := m["memory"]
+	if !cOK || !mOK {
+		return resource.Quantity{}, resource.Quantity{}, false
+	}
+	cpu, errC := resource.ParseQuantity(c)
+	mem, errM := resource.ParseQuantity(mm)
+	if errC != nil || errM != nil {
+		return resource.Quantity{}, resource.Quantity{}, false
+	}
+	return cpu, mem, true
+}
+
+// parseGPU extracts the GPU resource name + quantity from a GPU_REQUESTS spec
+// (e.g. "nvidia.com/gpu=1"). requested is false when absent, unparseable, or 0.
+func parseGPU(spec string) (name corev1.ResourceName, qty resource.Quantity, requested bool) {
+	for k, v := range parseResourceSpec(spec) {
+		q, err := resource.ParseQuantity(v)
+		if err == nil && !q.IsZero() {
+			return corev1.ResourceName(k), q, true
+		}
+	}
+	return "", resource.Quantity{}, false
+}
+
+// nodeReady reports whether a node's Ready condition is True.
+func nodeReady(n corev1.Node) bool {
+	for _, c := range n.Status.Conditions {
+		if c.Type == corev1.NodeReady {
+			return c.Status == corev1.ConditionTrue
+		}
+	}
+	return false
+}
+
+// jobsManagerEnv reads jobs-manager's first-container plain env into a map
+// (valueFrom entries have no literal value and are skipped). Best-effort:
+// returns an empty map when the deployment can't be fetched.
+func jobsManagerEnv(ctx context.Context, cs kubernetes.Interface, ns string, release *cluster.ParentRelease) map[string]string {
+	env := map[string]string{}
+	dep := findDeployment(ctx, cs, ns, release, "jobs-manager")
+	if dep == nil || len(dep.Spec.Template.Spec.Containers) == 0 {
+		return env
+	}
+	for _, e := range dep.Spec.Template.Spec.Containers[0].Env {
+		if e.Value != "" {
+			env[e.Name] = e.Value
+		}
+	}
+	return env
+}
+
+// getDeployment returns the first of candidates that exists, or nil.
+func getDeployment(ctx context.Context, cs kubernetes.Interface, ns string, candidates []string) *appsv1.Deployment {
+	for _, n := range candidates {
+		if n == "" {
+			continue
+		}
+		d, err := cs.AppsV1().Deployments(ns).Get(ctx, n, metav1.GetOptions{})
+		if err == nil {
+			return d
+		}
+	}
+	return nil
+}
+
+// findDeployment locates a chart component's Deployment ("<suffix>", e.g.
+// "requests-proxy"), tied to the discovered release so a check can never be
+// satisfied by a DIFFERENT release's component or a stray bare one (Bugbot on
+// PR #89).
+//
+// Release known: take the chart's standard "<release>-<suffix>" name, or a bare
+// "<suffix>" ONLY when its app.kubernetes.io/instance label ties it to this
+// release (older unprefixed charts). A deployment belonging to another release
+// is never accepted — if this release's component is missing, return nil and let
+// the check report it.
+//
+// Release unknown (discovery failed): match by name suffix, but only when
+// EXACTLY ONE deployment carries it. With several (multiple releases, which
+// DiscoverParentRelease refuses to disambiguate) there's no safe attribution, so
+// return nil rather than guess — which would let different checks describe
+// different releases in one run.
+func findDeployment(ctx context.Context, cs kubernetes.Interface, ns string, release *cluster.ParentRelease, suffix string) *appsv1.Deployment {
+	if release != nil && release.ReleaseName != "" {
+		if d := getDeployment(ctx, cs, ns, []string{release.ReleaseName + "-" + suffix}); d != nil {
+			return d
+		}
+		if d := getDeployment(ctx, cs, ns, []string{suffix}); d != nil &&
+			d.Labels["app.kubernetes.io/instance"] == release.ReleaseName {
+			return d
+		}
+		return nil
+	}
+
+	deps, err := cs.AppsV1().Deployments(ns).List(ctx, metav1.ListOptions{})
+	if err != nil {
+		return nil
+	}
+	var match *appsv1.Deployment
+	for i := range deps.Items {
+		if n := deps.Items[i].Name; n == suffix || strings.HasSuffix(n, "-"+suffix) {
+			if match != nil {
+				return nil // ambiguous across releases — don't guess
+			}
+			match = &deps.Items[i]
+		}
+	}
+	return match
+}
+
+// httpProbe is the default backend prober: a GET with a short timeout over
+// Go's default transport, which honors HTTP(S)_PROXY/NO_PROXY via
+// http.ProxyFromEnvironment. Any HTTP response (even 4xx/5xx) means the host
+// is reachable — we're testing connectivity, not the endpoint's health.
+func httpProbe(ctx context.Context, url string) error {
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
+	if err != nil {
+		return err
+	}
+	client := &http.Client{Timeout: httpProbeTimeout}
+	resp, err := client.Do(req)
+	if err != nil {
+		return err
+	}
+	// Connected — the host is reachable regardless of status code. Discard the
+	// close error so a rare post-connect close failure isn't reported as "down".
+	_ = resp.Body.Close()
+	return nil
+}
diff --git a/internal/doctor/doctor_test.go b/internal/doctor/doctor_test.go
new file mode 100644
index 0000000..7eaaa22
--- /dev/null
+++ b/internal/doctor/doctor_test.go
@@ -0,0 +1,501 @@
+package doctor
+
+import (
+	"context"
+	"errors"
+	"strings"
+	"testing"
+	"time"
+
+	appsv1 "k8s.io/api/apps/v1"
+	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/resource"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/client-go/kubernetes/fake"
+
+	"github.com/tracebloc/cli/internal/cluster"
+)
+
+const ns = "tracebloc"
+
+func bg() context.Context { return context.Background() }
+
+// jobsManagerDep mirrors the chart labels DiscoverParentRelease keys off
+// (see internal/cluster/discover_test.go) so the fake clientset discovers it.
+func jobsManagerDep(release string, env ...corev1.EnvVar) *appsv1.Deployment {
+	return &appsv1.Deployment{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      release + "-jobs-manager",
+			Namespace: ns,
+			Labels: map[string]string{
+				"app.kubernetes.io/name":       "client",
+				"app.kubernetes.io/instance":   release,
+				"app.kubernetes.io/managed-by": "Helm",
+				"app.kubernetes.io/version":    "1.3.5",
+				"helm.sh/chart":                "client-1.3.5",
+			},
+		},
+		Spec: appsv1.DeploymentSpec{
+			Template: corev1.PodTemplateSpec{
+				Spec: corev1.PodSpec{
+					Containers: []corev1.Container{{Name: "api", Env: env}},
+				},
+			},
+		},
+	}
+}
+
+func requestsProxyDep(release string, ready int32) *appsv1.Deployment {
+	return &appsv1.Deployment{
+		ObjectMeta: metav1.ObjectMeta{Name: release + "-requests-proxy", Namespace: ns},
+		Status:     appsv1.DeploymentStatus{Replicas: 1, ReadyReplicas: ready},
+	}
+}
+
+func boundPVC() *corev1.PersistentVolumeClaim {
+	return &corev1.PersistentVolumeClaim{
+		ObjectMeta: metav1.ObjectMeta{Name: cluster.SharedPVCClaimName, Namespace: ns},
+		Spec:       corev1.PersistentVolumeClaimSpec{AccessModes: []corev1.PersistentVolumeAccessMode{corev1.ReadWriteOnce}},
+		Status:     corev1.PersistentVolumeClaimStatus{Phase: corev1.ClaimBound},
+	}
+}
+
+func runningPod(name string) *corev1.Pod {
+	return &corev1.Pod{
+		ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: ns},
+		Status: corev1.PodStatus{
+			Phase:             corev1.PodRunning,
+			ContainerStatuses: []corev1.ContainerStatus{{Name: "c", RestartCount: 0}},
+		},
+	}
+}
+
+func crashPod(name string) *corev1.Pod {
+	return &corev1.Pod{
+		ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: ns},
+		Status: corev1.PodStatus{
+			Phase: corev1.PodRunning,
+			ContainerStatuses: []corev1.ContainerStatus{{
+				Name:  "c",
+				State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{Reason: "CrashLoopBackOff"}},
+			}},
+		},
+	}
+}
+
+// succeededPod is a finished job pod that retried before completing — a high
+// RestartCount here is historical, not a current crash-loop (Bugbot on #89).
+func succeededPod(name string, restarts int32) *corev1.Pod {
+	return &corev1.Pod{
+		ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: ns},
+		Status: corev1.PodStatus{
+			Phase:             corev1.PodSucceeded,
+			ContainerStatuses: []corev1.ContainerStatus{{Name: "c", RestartCount: restarts}},
+		},
+	}
+}
+
+// recoveredPod restarted several times but its container is running again now —
+// recovered, not crash-looping (cf. controller recovered-container fix, #117).
+func recoveredPod(name string, restarts int32) *corev1.Pod {
+	return &corev1.Pod{
+		ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: ns},
+		Status: corev1.PodStatus{
+			Phase: corev1.PodRunning,
+			ContainerStatuses: []corev1.ContainerStatus{{
+				Name:         "c",
+				RestartCount: restarts,
+				State:        corev1.ContainerState{Running: &corev1.ContainerStateRunning{}},
+			}},
+		},
+	}
+}
+
+func pendingPod(name string, age time.Duration) *corev1.Pod {
+	return &corev1.Pod{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:              name,
+			Namespace:         ns,
+			CreationTimestamp: metav1.NewTime(time.Now().Add(age)),
+		},
+		Status: corev1.PodStatus{Phase: corev1.PodPending},
+	}
+}
+
+// initCrashPod has an init container stuck in CrashLoopBackOff — the pod stays
+// Pending and cannot start, so it must read as a failure, not a Pending warning
+// (Bugbot on #89).
+func initCrashPod(name string) *corev1.Pod {
+	return &corev1.Pod{
+		ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: ns},
+		Status: corev1.PodStatus{
+			Phase: corev1.PodPending,
+			InitContainerStatuses: []corev1.ContainerStatus{{
+				Name:  "init",
+				State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{Reason: "CrashLoopBackOff"}},
+			}},
+		},
+	}
+}
+
+func TestWorst(t *testing.T) {
+	if got := Worst(nil); got != StatusOK {
+		t.Fatalf("Worst(nil) = %v, want ok", got)
+	}
+	rs := []Result{{Status: StatusOK}, {Status: StatusFail}, {Status: StatusWarn}}
+	if got := Worst(rs); got != StatusFail {
+		t.Fatalf("Worst = %v, want fail", got)
+	}
+}
+
+func TestCheckReachable(t *testing.T) {
+	if r := checkReachable(nil, errors.New("boom"), ns); r.Status != StatusFail {
+		t.Fatalf("error => %v, want fail", r.Status)
+	}
+	rel := &cluster.ParentRelease{ReleaseName: "tb", ChartVersion: "1.3.5", AppVersion: "1.3.5"}
+	r := checkReachable(rel, nil, ns)
+	if r.Status != StatusOK || !strings.Contains(r.Detail, "tb") {
+		t.Fatalf("release => %v / %q, want ok mentioning the release", r.Status, r.Detail)
+	}
+}
+
+func TestCheckPods(t *testing.T) {
+	tests := []struct {
+		name string
+		pod  *corev1.Pod
+		want Status
+	}{
+		{"healthy", runningPod("ok"), StatusOK},
+		{"crash-loop", crashPod("bad"), StatusFail},
+		{"pending-old", pendingPod("stuck", -10*time.Minute), StatusWarn},
+		{"pending-fresh", pendingPod("fresh", -time.Minute), StatusOK},
+		{"succeeded-high-restarts", succeededPod("done", 5), StatusOK},
+		{"recovered-running", recoveredPod("recovered", 5), StatusOK},
+		{"init-crash-loop", initCrashPod("initbad"), StatusFail},
+	}
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			cs := fake.NewClientset(tc.pod)
+			if r := checkPods(bg(), cs, ns); r.Status != tc.want {
+				t.Fatalf("checkPods = %v (%q), want %v", r.Status, r.Detail, tc.want)
+			}
+		})
+	}
+}
+
+func TestCheckPVC(t *testing.T) {
+	if r := checkPVC(bg(), fake.NewClientset(boundPVC()), ns); r.Status != StatusOK {
+		t.Fatalf("bound PVC => %v, want ok", r.Status)
+	}
+	if r := checkPVC(bg(), fake.NewClientset(), ns); r.Status != StatusFail {
+		t.Fatalf("missing PVC => %v, want fail", r.Status)
+	}
+}
+
+func TestCheckProxy(t *testing.T) {
+	tests := []struct {
+		name   string
+		env    map[string]string
+		want   Status
+		substr string
+	}{
+		{"requests-proxy set", map[string]string{"REQUESTS_PROXY_URL": "http://requests-proxy-service:8888"}, StatusOK, "requests-proxy="},
+		{"corporate proxy", map[string]string{"REQUESTS_PROXY_URL": "http://x", "HTTPS_PROXY": "http://corp:3128"}, StatusOK, "corporate HTTP(S)_PROXY set"},
+		{"empty", map[string]string{}, StatusWarn, "REQUESTS_PROXY_URL"},
+	}
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			r := checkProxy(tc.env)
+			if r.Status != tc.want || !strings.Contains(r.Detail, tc.substr) {
+				t.Fatalf("checkProxy = %v / %q, want %v containing %q", r.Status, r.Detail, tc.want, tc.substr)
+			}
+		})
+	}
+}
+
+func TestCheckBackendEgress(t *testing.T) {
+	okProbe := func(context.Context, string) error { return nil }
+	failProbe := func(context.Context, string) error { return errors.New("dns failure") }
+
+	if r := checkBackendEgress(bg(), map[string]string{"CLIENT_ENV": "dev"}, okProbe); r.Status != StatusOK || !strings.Contains(r.Detail, "dev-api.tracebloc.io") {
+		t.Fatalf("reachable dev => %v / %q", r.Status, r.Detail)
+	}
+	if r := checkBackendEgress(bg(), map[string]string{}, failProbe); r.Status != StatusFail || !strings.Contains(r.Detail, "api.tracebloc.io") {
+		t.Fatalf("unreachable default => %v / %q", r.Status, r.Detail)
+	}
+}
+
+func TestBackendHost(t *testing.T) {
+	tests := map[string]string{
+		"dev":   "dev-api.tracebloc.io",
+		"stg":   "stg-api.tracebloc.io",
+		"prod":  "api.tracebloc.io",
+		"":      "api.tracebloc.io",
+		"weird": "api.tracebloc.io",
+		// Case/space-insensitive, matching the API client's env resolution — a
+		// non-lowercase CLIENT_ENV must not fall through to prod.
+		"DEV":   "dev-api.tracebloc.io",
+		"Stg":   "stg-api.tracebloc.io",
+		" dev ": "dev-api.tracebloc.io",
+	}
+	for in, want := range tests {
+		if got := backendHost(in); got != want {
+			t.Errorf("backendHost(%q) = %q, want %q", in, got, want)
+		}
+	}
+}
+
+func TestCheckRequestsProxy(t *testing.T) {
+	rel := &cluster.ParentRelease{ReleaseName: "tb"}
+	tests := []struct {
+		name string
+		dep  *appsv1.Deployment // nil => deployment absent
+		want Status
+	}{
+		{"ready", requestsProxyDep("tb", 1), StatusOK},
+		{"not-ready", requestsProxyDep("tb", 0), StatusFail},
+		{"missing", nil, StatusFail},
+	}
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			cs := fake.NewClientset()
+			if tc.dep != nil {
+				cs = fake.NewClientset(tc.dep)
+			}
+			if r := checkRequestsProxy(bg(), cs, ns, rel); r.Status != tc.want {
+				t.Fatalf("checkRequestsProxy = %v (%q), want %v", r.Status, r.Detail, tc.want)
+			}
+		})
+	}
+}
+
+// When DiscoverParentRelease failed (release nil) but a release-prefixed
+// requests-proxy exists, the suffix fallback must still find it rather than
+// falsely report it missing (Bugbot on #89).
+func TestCheckRequestsProxy_NilReleaseFindsPrefixed(t *testing.T) {
+	cs := fake.NewClientset(requestsProxyDep("tb", 1)) // "tb-requests-proxy"
+	if r := checkRequestsProxy(bg(), cs, ns, nil); r.Status != StatusOK {
+		t.Fatalf("nil release with prefixed deploy => %v (%q), want ok", r.Status, r.Detail)
+	}
+}
+
+// With multiple parent releases in one namespace (the case DiscoverParentRelease
+// refuses) and no discovered release, the suffix fallback must NOT pick one
+// arbitrarily — guessing could let different checks describe different releases
+// in a single run (Bugbot on #89). It should report can't-determine, not OK.
+func TestCheckRequestsProxy_NilReleaseAmbiguous(t *testing.T) {
+	cs := fake.NewClientset(
+		requestsProxyDep("relA", 1), // "relA-requests-proxy"
+		requestsProxyDep("relB", 1), // "relB-requests-proxy"
+	)
+	if r := checkRequestsProxy(bg(), cs, ns, nil); r.Status == StatusOK {
+		t.Fatalf("ambiguous multi-release => %v (%q), want not-OK (no guessing)", r.Status, r.Detail)
+	}
+}
+
+// With a release discovered, the check must be tied to THAT release: another
+// release's requests-proxy must not be accepted as the discovered release's,
+// or relA goes green on relB's proxy while relA's is actually missing
+// (Bugbot on #89).
+func TestCheckRequestsProxy_DiscoveredReleaseIgnoresOtherReleases(t *testing.T) {
+	rel := &cluster.ParentRelease{ReleaseName: "relA"} // relA has no requests-proxy
+	cs := fake.NewClientset(requestsProxyDep("relB", 1))
+	if r := checkRequestsProxy(bg(), cs, ns, rel); r.Status == StatusOK {
+		t.Fatalf("relA proxy missing, relB present => %v (%q), want not-OK", r.Status, r.Detail)
+	}
+}
+
+// A bare (unprefixed) requests-proxy is accepted only when its instance label
+// ties it to the discovered release — covering older unprefixed charts.
+func TestCheckRequestsProxy_BareNameAcceptedWhenLabelledForRelease(t *testing.T) {
+	rel := &cluster.ParentRelease{ReleaseName: "relA"}
+	bare := requestsProxyDep("relA", 1)
+	bare.Name = "requests-proxy"
+	bare.Labels = map[string]string{"app.kubernetes.io/instance": "relA"}
+	cs := fake.NewClientset(bare)
+	if r := checkRequestsProxy(bg(), cs, ns, rel); r.Status != StatusOK {
+		t.Fatalf("bare requests-proxy labelled for relA => %v (%q), want ok", r.Status, r.Detail)
+	}
+}
+
+func TestRun_HealthyCluster(t *testing.T) {
+	const rel = "tb"
+	cs := fake.NewClientset(
+		jobsManagerDep(rel,
+			corev1.EnvVar{Name: "REQUESTS_PROXY_URL", Value: "http://requests-proxy-service:8888"},
+			corev1.EnvVar{Name: "CLIENT_ENV", Value: "dev"},
+			corev1.EnvVar{Name: "RESOURCE_REQUESTS", Value: "cpu=2,memory=8Gi"},
+		),
+		requestsProxyDep(rel, 1),
+		boundPVC(),
+		runningPod("tb-jobs-manager-abc"),
+		node("n1", "4", "16Gi"),
+	)
+
+	results := Run(bg(), cs, Options{
+		Namespace: ns,
+		HTTPProbe: func(context.Context, string) error { return nil },
+	})
+
+	if len(results) != 8 {
+		t.Fatalf("want 8 checks, got %d", len(results))
+	}
+	if w := Worst(results); w != StatusOK {
+		for _, r := range results {
+			t.Logf("%-32s %-4s %s", r.Name, r.Status, r.Detail)
+		}
+		t.Fatalf("healthy cluster worst = %v, want ok", w)
+	}
+}
+
+func node(name, cpu, mem string, gpu ...string) *corev1.Node {
+	alloc := corev1.ResourceList{
+		corev1.ResourceCPU:    resource.MustParse(cpu),
+		corev1.ResourceMemory: resource.MustParse(mem),
+	}
+	if len(gpu) == 2 {
+		alloc[corev1.ResourceName(gpu[0])] = resource.MustParse(gpu[1])
+	}
+	return &corev1.Node{
+		ObjectMeta: metav1.ObjectMeta{Name: name},
+		Status: corev1.NodeStatus{
+			Allocatable: alloc,
+			Conditions:  []corev1.NodeCondition{{Type: corev1.NodeReady, Status: corev1.ConditionTrue}},
+		},
+	}
+}
+
+func TestParseCPUMem(t *testing.T) {
+	cpu, mem, ok := parseCPUMem("cpu=2,memory=8Gi")
+	if !ok || cpu.String() != "2" || mem.String() != "8Gi" {
+		t.Fatalf("parseCPUMem => %q %q %v", cpu.String(), mem.String(), ok)
+	}
+	if _, _, ok := parseCPUMem("cpu=2"); ok {
+		t.Fatalf("missing memory should be !ok")
+	}
+	if _, _, ok := parseCPUMem("cpu=abc,memory=8Gi"); ok {
+		t.Fatalf("unparseable cpu should be !ok")
+	}
+}
+
+func TestParseGPU(t *testing.T) {
+	name, qty, req := parseGPU("nvidia.com/gpu=1")
+	if !req || string(name) != "nvidia.com/gpu" || qty.String() != "1" {
+		t.Fatalf("parseGPU => %q %q %v", name, qty.String(), req)
+	}
+	if _, _, req := parseGPU("nvidia.com/gpu=0"); req {
+		t.Fatalf("zero gpu should be !requested")
+	}
+	if _, _, req := parseGPU(""); req {
+		t.Fatalf("empty should be !requested")
+	}
+}
+
+func TestCheckNodeFit(t *testing.T) {
+	full := map[string]string{"RESOURCE_REQUESTS": "cpu=2,memory=8Gi", "GPU_REQUESTS": "nvidia.com/gpu=1"}
+	cpuOnly := map[string]string{"RESOURCE_REQUESTS": "cpu=2,memory=8Gi"}
+
+	t.Run("fits cpu+mem+gpu", func(t *testing.T) {
+		cs := fake.NewClientset(node("n1", "4", "16Gi", "nvidia.com/gpu", "2"))
+		if r := checkNodeFit(bg(), cs, full); r.Status != StatusOK {
+			t.Fatalf("=> %v (%q), want ok", r.Status, r.Detail)
+		}
+	})
+	t.Run("no node big enough -> fail", func(t *testing.T) {
+		cs := fake.NewClientset(node("n1", "1", "2Gi"))
+		if r := checkNodeFit(bg(), cs, cpuOnly); r.Status != StatusFail {
+			t.Fatalf("=> %v (%q), want fail", r.Status, r.Detail)
+		}
+	})
+	t.Run("gpu requested but none -> warn", func(t *testing.T) {
+		cs := fake.NewClientset(node("n1", "4", "16Gi")) // cpu/mem fit, no gpu
+		if r := checkNodeFit(bg(), cs, full); r.Status != StatusWarn {
+			t.Fatalf("=> %v (%q), want warn", r.Status, r.Detail)
+		}
+	})
+	t.Run("cpu+mem and gpu on different nodes -> warn, not ok", func(t *testing.T) {
+		// The Bugbot #91 case: one node fits cpu/mem, a different node has the
+		// GPU but is too small. No single node runs a GPU job → must NOT be ok.
+		cs := fake.NewClientset(
+			node("big", "4", "16Gi"),                       // cpu/mem, no gpu
+			node("gpu", "1", "1Gi", "nvidia.com/gpu", "2"), // gpu, too small
+		)
+		if r := checkNodeFit(bg(), cs, full); r.Status != StatusWarn {
+			t.Fatalf("=> %v (%q), want warn (no single node fits all)", r.Status, r.Detail)
+		}
+	})
+	t.Run("single node fits cpu+mem+gpu -> ok", func(t *testing.T) {
+		cs := fake.NewClientset(
+			node("big", "4", "16Gi"),                         // distractor: cpu/mem only
+			node("full", "4", "16Gi", "nvidia.com/gpu", "1"), // satisfies everything
+		)
+		if r := checkNodeFit(bg(), cs, full); r.Status != StatusOK {
+			t.Fatalf("=> %v (%q), want ok", r.Status, r.Detail)
+		}
+	})
+	t.Run("not-ready node doesn't count -> fail", func(t *testing.T) {
+		n := node("n1", "8", "32Gi")
+		n.Status.Conditions = []corev1.NodeCondition{{Type: corev1.NodeReady, Status: corev1.ConditionFalse}}
+		cs := fake.NewClientset(n)
+		if r := checkNodeFit(bg(), cs, cpuOnly); r.Status != StatusFail {
+			t.Fatalf("=> %v (%q), want fail (node not ready)", r.Status, r.Detail)
+		}
+	})
+	t.Run("missing RESOURCE_REQUESTS -> warn", func(t *testing.T) {
+		cs := fake.NewClientset(node("n1", "4", "16Gi"))
+		if r := checkNodeFit(bg(), cs, map[string]string{}); r.Status != StatusWarn {
+			t.Fatalf("=> %v (%q), want warn", r.Status, r.Detail)
+		}
+	})
+}
+
+func dockerSecret(name string, data []byte) *corev1.Secret {
+	return &corev1.Secret{
+		ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: ns},
+		Type:       corev1.SecretTypeDockerConfigJson,
+		Data:       map[string][]byte{corev1.DockerConfigJsonKey: data},
+	}
+}
+
+func jmDepWithPullSecret(release, secretName string) *appsv1.Deployment {
+	d := jobsManagerDep(release)
+	if secretName != "" {
+		d.Spec.Template.Spec.ImagePullSecrets = []corev1.LocalObjectReference{{Name: secretName}}
+	}
+	return d
+}
+
+func TestCheckImagePull(t *testing.T) {
+	rel := &cluster.ParentRelease{ReleaseName: "tb"}
+
+	t.Run("no pull secret -> ok", func(t *testing.T) {
+		cs := fake.NewClientset(jmDepWithPullSecret("tb", ""))
+		if r := checkImagePull(bg(), cs, ns, rel); r.Status != StatusOK {
+			t.Fatalf("=> %v (%q), want ok", r.Status, r.Detail)
+		}
+	})
+	t.Run("valid dockerconfigjson -> ok", func(t *testing.T) {
+		cs := fake.NewClientset(
+			jmDepWithPullSecret("tb", "reg"),
+			dockerSecret("reg", []byte(`{"auths":{}}`)),
+		)
+		if r := checkImagePull(bg(), cs, ns, rel); r.Status != StatusOK {
+			t.Fatalf("=> %v (%q), want ok", r.Status, r.Detail)
+		}
+	})
+	t.Run("missing secret -> fail", func(t *testing.T) {
+		cs := fake.NewClientset(jmDepWithPullSecret("tb", "reg")) // secret absent
+		if r := checkImagePull(bg(), cs, ns, rel); r.Status != StatusFail {
+			t.Fatalf("=> %v (%q), want fail", r.Status, r.Detail)
+		}
+	})
+	t.Run("malformed dockerconfigjson -> fail", func(t *testing.T) {
+		cs := fake.NewClientset(
+			jmDepWithPullSecret("tb", "reg"),
+			dockerSecret("reg", []byte("not json")),
+		)
+		if r := checkImagePull(bg(), cs, ns, rel); r.Status != StatusFail {
+			t.Fatalf("=> %v (%q), want fail", r.Status, r.Detail)
+		}
+	})
+}
diff --git a/internal/push/category.go b/internal/push/category.go
index c20728a..73f0ffd 100644
--- a/internal/push/category.go
+++ b/internal/push/category.go
@@ -1,72 +1,156 @@
 package push
 
-// Category families. These mirror data-ingestors'
+import "strings"
+
+// CategorySpec is the single source of truth for one task category's
+// CLI-relevant rules. It mirrors data-ingestors'
 // tracebloc_ingestor/cli/conventions.py groupings so the CLI's
-// per-category behaviour (which flags are required, which local
-// layout to expect, which spec fields to emit) stays in lock-step
+// per-category behaviour (which local layout to expect, which spec
+// fields to emit, whether a label policy is needed) stays in lock-step
 // with what the ingestor actually resolves.
 //
-// Kept as a single source of truth here rather than scattered
-// string comparisons across spec.go / dataset.go.
-
-// imageCategories take a labels CSV + an images/ directory (plus,
-// for some, extra sidecar dirs handled in later increments).
-var imageCategories = map[string]bool{
-	"image_classification":  true,
-	"object_detection":      true,
-	"keypoint_detection":    true,
-	"semantic_segmentation": true,
-	"instance_segmentation": true,
+// Everything category-shaped derives from the registry below — the
+// family predicates, the `--category` help text, the interactive
+// picker, and the push accept-gate — so the enumerations can't drift
+// apart (they used to: the flag help listed 5 of 9, cli#74).
+type CategorySpec struct {
+	// ID is the canonical category identifier; it matches the
+	// ingest.v1 schema enum (vendored via scripts/sync-schema.sh).
+	ID string
+	// Family selects the local layout + staging shape.
+	Family Family
+	// Label is the human-friendly name shown in the interactive picker.
+	Label string
+	// RegressionClass marks categories that predict a numeric target and
+	// therefore need label.policy (object label form) so the raw target
+	// never ships to the central backend by default.
+	RegressionClass bool
+	// CLISupported reports whether `dataset push` implements the category
+	// today. semantic_/instance_segmentation are known (the schema
+	// defines them) but not yet pushable.
+	CLISupported bool
+	// UnsupportedNote explains why a known-but-unimplemented category
+	// isn't available yet; surfaced by the push gate. Empty when supported.
+	UnsupportedNote string
+}
+
+// Family groups categories by local layout.
+type Family int
+
+const (
+	// FamilyImage: a labels CSV + an images/ directory (plus, for some,
+	// extra sidecar dirs like annotations/ or masks/).
+	FamilyImage Family = iota
+	// FamilyTabular: a single CSV whose columns are described by a
+	// `schema` (column → SQL type) map. No sidecar files.
+	FamilyTabular
+	// FamilyText: a labels CSV + a directory of text files (texts/ for
+	// classification, sequences/ for masked language modeling).
+	FamilyText
+)
+
+// categoryRegistry is the ordered, authoritative list of every category
+// the ingest.v1 schema defines. Order is the display order for help text
+// and the interactive picker (CLI-supported first, in workflow order;
+// the not-yet-implemented ones last). Adding a category to the schema
+// means adding it here — the parity test pins the set.
+var categoryRegistry = []CategorySpec{
+	{ID: "image_classification", Family: FamilyImage, Label: "Image classification", CLISupported: true},
+	{ID: "object_detection", Family: FamilyImage, Label: "Object detection", CLISupported: true},
+	{ID: "keypoint_detection", Family: FamilyImage, Label: "Keypoint detection", CLISupported: true},
+	{ID: "text_classification", Family: FamilyText, Label: "Text classification", CLISupported: true},
+	{ID: "masked_language_modeling", Family: FamilyText, Label: "Masked language modeling", CLISupported: true},
+	{ID: "tabular_classification", Family: FamilyTabular, Label: "Tabular classification", CLISupported: true},
+	{ID: "tabular_regression", Family: FamilyTabular, Label: "Tabular regression", RegressionClass: true, CLISupported: true},
+	{ID: "time_series_forecasting", Family: FamilyTabular, Label: "Time-series forecasting", RegressionClass: true, CLISupported: true},
+	{ID: "time_to_event_prediction", Family: FamilyTabular, Label: "Time-to-event prediction", RegressionClass: true, CLISupported: true},
+	{ID: "semantic_segmentation", Family: FamilyImage, Label: "Semantic segmentation", CLISupported: false,
+		UnsupportedNote: "blocked on the ingestor's mask-sidecar support (data-ingestors#136)"},
+	{ID: "instance_segmentation", Family: FamilyImage, Label: "Instance segmentation", CLISupported: false,
+		UnsupportedNote: "not implemented"},
+	{ID: "causal_language_modeling", Family: FamilyText, Label: "Causal language modeling", CLISupported: false,
+		UnsupportedNote: "schema-recognized (data-ingestors#805); `tracebloc ingest` discover/build for its raw-.txt / prompt\\tcompletion `texts` layout is pending"},
+	{ID: "token_classification", Family: FamilyText, Label: "Token classification", CLISupported: false,
+		UnsupportedNote: "schema-recognized; the CLI doesn't stage its per-token-label `texts` layout yet"},
+}
+
+// categoryByID indexes the registry for O(1) lookup, built once.
+var categoryByID = func() map[string]CategorySpec {
+	m := make(map[string]CategorySpec, len(categoryRegistry))
+	for _, c := range categoryRegistry {
+		m[c.ID] = c
+	}
+	return m
+}()
+
+// Lookup returns the spec for a category id and whether it is known.
+func Lookup(category string) (CategorySpec, bool) {
+	c, ok := categoryByID[category]
+	return c, ok
+}
+
+// IsKnown reports whether category is a recognized task category (in the
+// schema), supported by the CLI or not.
+func IsKnown(category string) bool {
+	_, ok := categoryByID[category]
+	return ok
 }
 
-// tabularCategories take a single CSV whose columns are described by
-// a `schema` (column → SQL type) map. No sidecar files.
-var tabularCategories = map[string]bool{
-	"tabular_classification":   true,
-	"tabular_regression":       true,
-	"time_series_forecasting":  true,
-	"time_to_event_prediction": true,
+// IsCLISupported reports whether `dataset push` implements category today.
+func IsCLISupported(category string) bool { return categoryByID[category].CLISupported }
+
+// IsImage reports whether category uses the labels.csv + images/ layout.
+func IsImage(category string) bool {
+	c, ok := categoryByID[category]
+	return ok && c.Family == FamilyImage
 }
 
-// regressionClassCategories predict a numeric target rather than a
-// class. The schema requires the label in object form with an
-// explicit `policy` so the raw target never ships to the central
-// backend by default (policy=bucket bins it first).
-var regressionClassCategories = map[string]bool{
-	"tabular_regression":       true,
-	"time_series_forecasting":  true,
-	"time_to_event_prediction": true,
+// IsTabular reports whether category uses the single-CSV + schema layout.
+func IsTabular(category string) bool {
+	c, ok := categoryByID[category]
+	return ok && c.Family == FamilyTabular
 }
 
-// textCategories take a labels CSV + a directory of text files
-// (texts/ for classification, sequences/ for masked language
-// modeling). masked_language_modeling additionally needs a
-// tokenizer.json at the dataset root and has NO label.
-var textCategories = map[string]bool{
-	"text_classification":      true,
-	"masked_language_modeling": true,
+// IsText reports whether category uses the labels.csv + text-file dir layout.
+func IsText(category string) bool {
+	c, ok := categoryByID[category]
+	return ok && c.Family == FamilyText
 }
 
-// IsImage reports whether category uses the labels.csv + images/
-// local layout.
-func IsImage(category string) bool { return imageCategories[category] }
+// IsRegressionClass reports whether category predicts a numeric target and
+// therefore needs label.policy (object label form).
+func IsRegressionClass(category string) bool { return categoryByID[category].RegressionClass }
 
-// IsTabular reports whether category uses the single-CSV + schema
-// local layout (no sidecar files).
-func IsTabular(category string) bool { return tabularCategories[category] }
+// SupportedCategoryIDs returns the ids `dataset push` supports, in display
+// order. Used to build the --category help, the interactive picker, and
+// the accept-gate's "Supported:" lists from one place.
+func SupportedCategoryIDs() []string {
+	ids := make([]string, 0, len(categoryRegistry))
+	for _, c := range categoryRegistry {
+		if c.CLISupported {
+			ids = append(ids, c.ID)
+		}
+	}
+	return ids
+}
 
-// IsRegressionClass reports whether category predicts a numeric
-// target and therefore needs label.policy (object label form).
-func IsRegressionClass(category string) bool { return regressionClassCategories[category] }
+// AllCategoryIDs returns every recognized category id, in registry order.
+func AllCategoryIDs() []string {
+	ids := make([]string, 0, len(categoryRegistry))
+	for _, c := range categoryRegistry {
+		ids = append(ids, c.ID)
+	}
+	return ids
+}
 
-// IsText reports whether category uses the labels.csv + text-file
-// directory (texts/ or sequences/) local layout.
-func IsText(category string) bool { return textCategories[category] }
+// SupportedCategoriesList is the comma-joined supported ids, for help text
+// and gate error messages.
+func SupportedCategoriesList() string { return strings.Join(SupportedCategoryIDs(), ", ") }
 
 // TextSidecarDir returns the sidecar directory name a text category
 // expects: "sequences" for masked_language_modeling, "texts" for
-// text_classification. (Used both as the local subdir to stage and
-// the spec field to emit.)
+// text_classification. (Used both as the local subdir to stage and the
+// spec field to emit.)
 func TextSidecarDir(category string) string {
 	if category == "masked_language_modeling" {
 		return "sequences"
diff --git a/internal/push/category_registry_test.go b/internal/push/category_registry_test.go
new file mode 100644
index 0000000..17ce1f3
--- /dev/null
+++ b/internal/push/category_registry_test.go
@@ -0,0 +1,134 @@
+package push
+
+import (
+	"encoding/json"
+	"sort"
+	"testing"
+
+	"github.com/tracebloc/cli/internal/schema"
+)
+
+// The registry is the single source of truth; these pin its contents and
+// that the family predicates + the supported set all derive from it, so a
+// future edit can't reintroduce the "5 of 9" drift (cli#74).
+
+func TestRegistryKnownCategories(t *testing.T) {
+	want := []string{
+		"image_classification", "object_detection", "keypoint_detection",
+		"semantic_segmentation", "instance_segmentation",
+		"text_classification", "token_classification",
+		"masked_language_modeling", "causal_language_modeling",
+		"tabular_classification", "tabular_regression",
+		"time_series_forecasting", "time_to_event_prediction",
+	}
+	if got := AllCategoryIDs(); !equalSet(got, want) {
+		t.Fatalf("AllCategoryIDs() = %v, want set %v", got, want)
+	}
+	for _, id := range want {
+		if !IsKnown(id) {
+			t.Errorf("IsKnown(%q) = false, want true", id)
+		}
+	}
+	if IsKnown("not_a_category") {
+		t.Error(`IsKnown("not_a_category") = true, want false`)
+	}
+}
+
+func TestSupportedCategories(t *testing.T) {
+	got := SupportedCategoryIDs()
+	if len(got) != 9 {
+		t.Fatalf("SupportedCategoryIDs() len = %d, want 9: %v", len(got), got)
+	}
+	for _, id := range got {
+		if !IsCLISupported(id) {
+			t.Errorf("SupportedCategoryIDs returned %q but IsCLISupported is false", id)
+		}
+	}
+	// segmentation + causal_language_modeling are known but not yet pushable,
+	// and must explain why.
+	for _, id := range []string{"semantic_segmentation", "instance_segmentation", "causal_language_modeling", "token_classification"} {
+		if !IsKnown(id) {
+			t.Errorf("%s should be known", id)
+		}
+		if IsCLISupported(id) {
+			t.Errorf("%s should not be CLI-supported yet", id)
+		}
+		if spec, _ := Lookup(id); spec.UnsupportedNote == "" {
+			t.Errorf("%s should carry an UnsupportedNote", id)
+		}
+	}
+}
+
+func TestPredicatesDeriveFromRegistry(t *testing.T) {
+	for _, c := range categoryRegistry {
+		switch c.Family {
+		case FamilyImage:
+			if !IsImage(c.ID) || IsTabular(c.ID) || IsText(c.ID) {
+				t.Errorf("%s: predicates disagree with FamilyImage", c.ID)
+			}
+		case FamilyTabular:
+			if !IsTabular(c.ID) || IsImage(c.ID) || IsText(c.ID) {
+				t.Errorf("%s: predicates disagree with FamilyTabular", c.ID)
+			}
+		case FamilyText:
+			if !IsText(c.ID) || IsImage(c.ID) || IsTabular(c.ID) {
+				t.Errorf("%s: predicates disagree with FamilyText", c.ID)
+			}
+		}
+		if IsRegressionClass(c.ID) != c.RegressionClass {
+			t.Errorf("%s: IsRegressionClass = %v, want %v", c.ID, IsRegressionClass(c.ID), c.RegressionClass)
+		}
+	}
+	// An unknown category: every predicate false (no panic on missing key).
+	if IsImage("nope") || IsTabular("nope") || IsText("nope") ||
+		IsRegressionClass("nope") || IsCLISupported("nope") {
+		t.Error("predicates should all be false for an unknown category")
+	}
+}
+
+// TestRegistryCoversSchemaCategories pins registry⇄schema parity: every
+// category the ingest schema accepts must be known to the registry, or a
+// schema-valid `dataset push --category=X` is wrongly rejected as
+// "unrecognized" (the token_classification drift, Bugbot v0.4.0 RC). The
+// existing tests only pin the registry against a hand-written list, which
+// stays internally consistent while drifting from the schema — this closes
+// that gap. The reverse direction isn't required: the registry may carry a
+// known-but-unsupported alias the v1 schema doesn't list yet (e.g.
+// instance_segmentation), which is gated out before schema validation.
+func TestRegistryCoversSchemaCategories(t *testing.T) {
+	var doc struct {
+		Properties struct {
+			Category struct {
+				Enum []string `json:"enum"`
+			} `json:"category"`
+		} `json:"properties"`
+	}
+	if err := json.Unmarshal(schema.V1Bytes, &doc); err != nil {
+		t.Fatalf("parse embedded schema: %v", err)
+	}
+	if len(doc.Properties.Category.Enum) == 0 {
+		t.Fatal("no category enum found in the embedded schema (parse path wrong?)")
+	}
+	for _, id := range doc.Properties.Category.Enum {
+		if !IsKnown(id) {
+			t.Errorf("schema category %q missing from the registry — `dataset push --category=%s` "+
+				"would be rejected as unrecognized despite passing schema validation", id, id)
+		}
+	}
+}
+
+func equalSet(a, b []string) bool {
+	if len(a) != len(b) {
+		return false
+	}
+	as := append([]string(nil), a...)
+	bs := append([]string(nil), b...)
+	sort.Strings(as)
+	sort.Strings(bs)
+	for i := range as {
+		if as[i] != bs[i] {
+			return false
+		}
+	}
+	return true
+}
diff --git a/internal/push/teardown.go b/internal/push/teardown.go
index 5d30ccf..1c270fe 100644
--- a/internal/push/teardown.go
+++ b/internal/push/teardown.go
@@ -5,11 +5,11 @@ import (
 	"context"
 	"fmt"
 	"strings"
+	"time"
 
 	corev1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/client-go/kubernetes"
-	"k8s.io/client-go/rest"
 )
 
 // IngestionDatabase is the MySQL schema jobs-manager ingests tables
@@ -49,24 +49,30 @@ type TeardownResult struct {
 	RemovedPaths []string
 }
 
-// Teardown performs the in-cluster teardown described by plan, mirroring
-// the manual kubectl-exec cleanup:
+// Teardown performs the in-cluster teardown described by plan:
 //
 //   - DROP the MySQL table by exec-ing `mysql` inside the mysql pod,
 //     referencing the pod's own $MYSQL_ROOT_PASSWORD — so no database
 //     credential ever transits the CLI.
-//   - rm -rf the PVC dirs by exec-ing inside the jobs-manager pod,
-//     which mounts the shared PVC at SharedRoot.
+//   - rm -rf the PVC dirs from a short-lived pod that mirrors the CLI's
+//     stage pod (uid 65532 + fsGroup 65532, shared PVC mounted) — built
+//     from podOpts via BuildStagePodSpec.
 //
-// DESIGN NOTE: this exec-into-existing-pods approach is the
-// "CLI-direct teardown" (the alternative considered was a server-side
-// jobs-manager delete endpoint). It assumes (a) a pod whose name
-// contains "mysql" exposes $MYSQL_ROOT_PASSWORD, and (b) the
-// jobs-manager pod mounts the shared PVC at SharedRoot — both true for
-// the current parent chart, but worth confirming before this ships.
-func Teardown(ctx context.Context, cs kubernetes.Interface, cfg *rest.Config, namespace string, plan TeardownPlan) (TeardownResult, error) {
+// Why an ephemeral stage-identity pod and NOT the long-lived
+// jobs-manager pod (tracebloc/client#259): the staging files under
+// SharedRoot/.tracebloc-staging/<table> are written by the stage pod as
+// uid 65532 (and the ingestor's SharedRoot/<table> files as uid 65534).
+// The jobs-manager pod runs as a different non-root uid with no shared
+// fsGroup, so its `rm` hit EACCES on 65532-owned files in a
+// non-group-writable directory and left orphans. A teardown pod that
+// runs as the same uid that wrote the staging files OWNS them, so it
+// deletes them by ownership — which works on hostPath (where fsGroup is
+// a no-op, kubernetes/kubernetes#138411) and CSI alike.
+//
+// DESIGN NOTE: still assumes a pod whose name contains "mysql" exposes
+// $MYSQL_ROOT_PASSWORD (true for the current parent chart).
+func Teardown(ctx context.Context, cs kubernetes.Interface, exec Executor, namespace string, plan TeardownPlan, podOpts PodSpecOptions) (TeardownResult, error) {
 	var res TeardownResult
-	exec := &SPDYExecutor{Config: cfg, Client: cs}
 
 	// 1. DROP the table — mysql pod, localhost, its own root password.
 	mysqlPod, mysqlContainer, err := findRunningPod(ctx, cs, namespace, "mysql")
@@ -82,14 +88,28 @@ func Teardown(ctx context.Context, cs kubernetes.Interface, cfg *rest.Config, na
 	}
 	res.DroppedTable = true
 
-	// 2. rm the PVC dirs — jobs-manager pod mounts the shared PVC.
-	jmPod, jmContainer, err := findRunningPod(ctx, cs, namespace, "jobs-manager")
+	// 2. rm the PVC dirs from an ephemeral stage-identity pod (see the
+	//    doc note above + #259). The pod owns the staging files it
+	//    deletes, so this works on hostPath and CSI.
+	podOpts.Namespace = namespace
+	podName, err := CreateStagePod(ctx, cs, podOpts)
 	if err != nil {
-		return res, fmt.Errorf("locating jobs-manager pod: %w", err)
+		return res, fmt.Errorf("creating teardown pod: %w", err)
+	}
+	// Always clean up the teardown pod — even if the wait/exec fails or
+	// the parent ctx is cancelled. Fresh ctx so the delete still reaches
+	// the API. Mirrors push.Stage's deferred cleanup.
+	defer func() {
+		delCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
+		defer cancel()
+		_ = DeleteStagePod(delCtx, cs, namespace, podName)
+	}()
+	if _, err := WaitForStagePodReady(ctx, cs, namespace, podName); err != nil {
+		return res, fmt.Errorf("waiting for teardown pod: %w", err)
 	}
 	stderr.Reset()
 	rmCmd := append([]string{"rm", "-rf"}, plan.PVCPaths...)
-	if err := exec.Exec(ctx, namespace, jmPod, jmContainer, rmCmd, nil, nil, &stderr); err != nil {
+	if err := exec.Exec(ctx, namespace, podName, "stage", rmCmd, nil, nil, &stderr); err != nil {
 		return res, fmt.Errorf("removing PVC paths: %w%s", err, stderrSuffix(&stderr))
 	}
 	res.RemovedPaths = plan.PVCPaths
diff --git a/internal/push/teardown_test.go b/internal/push/teardown_test.go
index a3e9444..d7275d7 100644
--- a/internal/push/teardown_test.go
+++ b/internal/push/teardown_test.go
@@ -1,6 +1,15 @@
 package push
 
-import "testing"
+import (
+	"context"
+	"strings"
+	"testing"
+
+	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/client-go/kubernetes/fake"
+	k8stesting "k8s.io/client-go/testing"
+)
 
 // TestPlanTeardown pins the artifact set `dataset rm` targets: the
 // MySQL table in IngestionDatabase + both PVC dirs (final dest +
@@ -28,3 +37,81 @@ func TestPlanTeardown(t *testing.T) {
 		}
 	}
 }
+
+// TestTeardown_RemovesViaStageIdentityPod is the regression test for
+// tracebloc/client#259: `dataset rm` must NOT run the file `rm` inside
+// the long-lived jobs-manager pod (a non-root uid that cannot delete
+// the uid-65532-owned staging files). It must run it in a short-lived
+// pod that mirrors the stage pod's identity (uid 65532 + fsGroup 65532),
+// which owns the staging files and so can delete them on any volume type
+// (hostPath included, where fsGroup is a no-op).
+func TestTeardown_RemovesViaStageIdentityPod(t *testing.T) {
+	// A running "mysql" pod must exist so step 1 (DROP TABLE) can locate
+	// it. The teardown pod is created by Teardown itself and marked Ready
+	// by the reactor (shared with stage_test.go).
+	cs := fake.NewClientset(&corev1.Pod{
+		ObjectMeta: metav1.ObjectMeta{Name: "mysql-0", Namespace: "tracebloc"},
+		Spec:       corev1.PodSpec{Containers: []corev1.Container{{Name: "mysql"}}},
+		Status:     corev1.PodStatus{Phase: corev1.PodRunning},
+	})
+	readyOnNextGet(cs)
+	fe := &fakeExecutor{}
+
+	plan := PlanTeardown("reg_train")
+	res, err := Teardown(context.Background(), cs, fe, "tracebloc", plan, PodSpecOptions{
+		Namespace:    "tracebloc",
+		PVCClaimName: "client-pvc",
+		PVCMountPath: "/data/shared",
+		Table:        "reg_train",
+	})
+	if err != nil {
+		t.Fatalf("Teardown: %v", err)
+	}
+	if !res.DroppedTable {
+		t.Error("DroppedTable = false, want true")
+	}
+
+	// The rm is the LAST Exec call. It must target the ephemeral stage
+	// pod, NOT a jobs-manager pod — that's the #259 fix.
+	if !strings.HasPrefix(fe.gotPod, "tracebloc-stage-") {
+		t.Errorf("rm ran in pod %q, want the ephemeral stage pod (tracebloc-stage-*); "+
+			"running it in the jobs-manager pod is the #259 bug", fe.gotPod)
+	}
+	if strings.Contains(fe.gotPod, "jobs-manager") {
+		t.Errorf("rm ran in the jobs-manager pod (%q) — the #259 regression", fe.gotPod)
+	}
+	if fe.gotContainer != "stage" {
+		t.Errorf("rm container = %q, want stage", fe.gotContainer)
+	}
+	wantCmd := "rm -rf " + strings.Join(plan.PVCPaths, " ")
+	if got := strings.Join(fe.gotCmd, " "); got != wantCmd {
+		t.Errorf("rm cmd = %q, want %q", got, wantCmd)
+	}
+
+	// The teardown pod must run as the stage uid (65532) + fsGroup so it
+	// OWNS the staging files it deletes. Inspect the created Pod via the
+	// fake clientset action log (the pod is deleted before the test ends).
+	var sc *corev1.PodSecurityContext
+	for _, action := range cs.Actions() {
+		if action.GetVerb() == "create" && action.GetResource().Resource == "pods" {
+			sc = action.(k8stesting.CreateAction).GetObject().(*corev1.Pod).Spec.SecurityContext
+			break
+		}
+	}
+	if sc == nil {
+		t.Fatal("no Pod create observed — teardown did not spawn an ephemeral pod")
+	}
+	if sc.RunAsUser == nil || *sc.RunAsUser != 65532 {
+		t.Errorf("teardown pod RunAsUser = %v, want 65532", sc.RunAsUser)
+	}
+	if sc.FSGroup == nil || *sc.FSGroup != 65532 {
+		t.Errorf("teardown pod FSGroup = %v, want 65532", sc.FSGroup)
+	}
+
+	// No leaked teardown pods after a successful run.
+	pods, _ := cs.CoreV1().Pods("tracebloc").List(context.Background(),
+		metav1.ListOptions{LabelSelector: StagePodManagedByLabel + "=" + StagePodManagedByValue})
+	if len(pods.Items) != 0 {
+		t.Errorf("Teardown leaked %d stage pod(s)", len(pods.Items))
+	}
+}
diff --git a/internal/schema/ingest.v1.json b/internal/schema/ingest.v1.json
index d8eb83c..b5332b9 100644
--- a/internal/schema/ingest.v1.json
+++ b/internal/schema/ingest.v1.json
@@ -30,13 +30,14 @@
         "object_detection",
         "keypoint_detection",
         "semantic_segmentation",
-        "instance_segmentation",
         "text_classification",
+        "token_classification",
         "tabular_classification",
         "tabular_regression",
         "time_series_forecasting",
         "time_to_event_prediction",
-        "masked_language_modeling"
+        "masked_language_modeling",
+        "causal_language_modeling"
       ],
       "description": "Task category. Drives convention defaults: validators, data_format, default columns, default file extensions, default validator set."
     },
@@ -86,7 +87,7 @@
     "texts": {
       "type": "string",
       "minLength": 1,
-      "description": "Directory holding text files referenced by the labels CSV. Required for text_classification."
+      "description": "Directory holding text files referenced by the labels CSV. Required for text_classification, token_classification, and causal_language_modeling (raw .txt: plain text, or a tab-separated prompt\\tcompletion pair)."
     },
 
     "sequences": {
@@ -143,7 +144,7 @@
       "items": { "type": "integer", "minimum": 1 },
       "minItems": 2,
       "maxItems": 2,
-      "description": "[height, width] to resize images to. Required for keypoint_detection (no default — depends on the customer's pose model). Other image categories use category defaults if unset."
+      "description": "[width, height] to resize images to. Required for keypoint_detection (no default — depends on the customer's pose model). Other image categories use category defaults if unset. The order matches PIL.Image.size and what ImageResolutionValidator expects."
     },
 
     "number_of_keypoints": {
@@ -198,7 +199,7 @@
               "items": { "type": "integer", "minimum": 1 },
               "minItems": 2,
               "maxItems": 2,
-              "description": "[height, width]. Image categories only. Default [512, 512]."
+              "description": "[width, height]. Image categories only. Default [512, 512]. The order matches PIL.Image.size and what ImageResolutionValidator expects."
             },
             "extension": {
               "type": "string",
@@ -288,8 +289,7 @@
               "image_classification",
               "object_detection",
               "keypoint_detection",
-              "semantic_segmentation",
-              "instance_segmentation"
+              "semantic_segmentation"
             ]
           }
         },
@@ -321,6 +321,14 @@
       },
       "then": { "required": ["texts"] }
     },
+    {
+      "description": "token_classification requires `texts`.",
+      "if": {
+        "properties": { "category": { "const": "token_classification" } },
+        "required": ["category"]
+      },
+      "then": { "required": ["texts"] }
+    },
     {
       "description": "masked_language_modeling requires `sequences`.",
       "if": {
@@ -329,6 +337,14 @@
       },
       "then": { "required": ["sequences"] }
     },
+    {
+      "description": "causal_language_modeling requires `texts` (raw .txt samples). It is self-supervised, so unlike text/token classification it does NOT require `label`.",
+      "if": {
+        "properties": { "category": { "const": "causal_language_modeling" } },
+        "required": ["category"]
+      },
+      "then": { "required": ["texts"] }
+    },
     {
       "description": "tabular and time-series categories require `schema`.",
       "if": {
@@ -388,8 +404,8 @@
               "object_detection",
               "keypoint_detection",
               "semantic_segmentation",
-              "instance_segmentation",
               "text_classification",
+              "token_classification",
               "tabular_classification"
             ]
           }
@@ -397,6 +413,21 @@
         "required": ["category"]
       },
       "then": { "required": ["label"] }
+    },
+    {
+      "description": "Self-supervised categories MUST NOT set `label`. The shipped CSV has no label column, and the framework registers no edge-label metadata for them. Setting `label` anyway used to ingest rows successfully, then crash at backend registration with a misleading HTTP 400 'No data found' (issue #213). Reject at submission instead.",
+      "if": {
+        "properties": {
+          "category": {
+            "enum": [
+              "masked_language_modeling",
+              "causal_language_modeling"
+            ]
+          }
+        },
+        "required": ["category"]
+      },
+      "then": { "not": { "required": ["label"] } }
     }
   ]
 }
diff --git a/internal/slug/slug.go b/internal/slug/slug.go
new file mode 100644
index 0000000..fb3f8a0
--- /dev/null
+++ b/internal/slug/slug.go
@@ -0,0 +1,94 @@
+// Package slug ports the RFC-0001 namespace-slug rule (backend
+// common/utils/slug.py, RFC-0001 Appendix B) to Go. A client's display name is
+// slugified ONCE at provisioning into the immutable Kubernetes namespace, so
+// this MUST stay in lock-step with the Python definition — the backend
+// validates exactly what this produces (RFC-0001 backend#830; provisioning +
+// namespace validation in backend#836).
+package slug
+
+import (
+	"fmt"
+	"regexp"
+	"strings"
+
+	"golang.org/x/text/unicode/norm"
+)
+
+// MaxLabelLength is the DNS-1123 label cap.
+const MaxLabelLength = 63
+
+var (
+	nonAlnum  = regexp.MustCompile(`[^a-z0-9]+`)
+	multiDash = regexp.MustCompile(`-+`)
+)
+
+// Slugify maps name to a DNS-1123 label, or "" if nothing survives. Mirrors
+// slug.slugify_dns1123: NFKD-transliterate to ASCII, lowercase, map every run
+// of non-alphanumerics to a single "-", trim leading/trailing "-", cap at 63.
+func Slugify(name string) string {
+	if name == "" {
+		return ""
+	}
+	s := strings.ToLower(toASCII(name))
+	s = nonAlnum.ReplaceAllString(s, "-")
+	// Redundant after the run-collapse above, but slug.py runs the identical
+	// second pass — mirror it so the two stay structurally in lock-step.
+	s = multiDash.ReplaceAllString(s, "-")
+	s = strings.Trim(s, "-")
+	if len(s) > MaxLabelLength {
+		s = s[:MaxLabelLength]
+	}
+	return strings.TrimRight(s, "-")
+}
+
+// Derive returns a UNIQUE DNS-1123 slug for name, avoiding taken. On collision
+// it appends -2, -3, … within the 63-char cap. If name slugifies to empty it
+// falls back to fallback; with an empty fallback it errors (an empty slug must
+// never silently become a namespace). Mirrors slug.derive_slug — except Go has
+// no nil string, so fallback=="" is treated as "no fallback" (erroring); the
+// Python distinguishes None from "" but no caller passes "", and erroring is
+// the safer side of that edge.
+func Derive(name string, taken []string, fallback string) (string, error) {
+	base := Slugify(name)
+	if base == "" {
+		if fallback == "" {
+			return "", fmt.Errorf("name %q slugifies to empty; a fallback is required", name)
+		}
+		if base = Slugify(fallback); base == "" {
+			base = fallback
+		}
+	}
+	set := make(map[string]struct{}, len(taken))
+	for _, t := range taken {
+		set[t] = struct{}{}
+	}
+	if _, clash := set[base]; !clash {
+		return base, nil
+	}
+	for n := 2; ; n++ {
+		suffix := fmt.Sprintf("-%d", n)
+		end := MaxLabelLength - len(suffix)
+		if end > len(base) {
+			end = len(base)
+		}
+		if end < 0 {
+			end = 0
+		}
+		cand := strings.TrimRight(base[:end], "-") + suffix
+		if _, clash := set[cand]; !clash {
+			return cand, nil
+		}
+	}
+}
+
+func toASCII(s string) string {
+	// NFKD decompose then drop non-ASCII — matches Python's
+	// unicodedata.normalize("NFKD", s).encode("ascii", "ignore").
+	var b strings.Builder
+	for _, r := range norm.NFKD.String(s) {
+		if r < 128 {
+			b.WriteByte(byte(r))
+		}
+	}
+	return b.String()
+}
diff --git a/internal/slug/slug_golden_test.go b/internal/slug/slug_golden_test.go
new file mode 100644
index 0000000..04e25c2
--- /dev/null
+++ b/internal/slug/slug_golden_test.go
@@ -0,0 +1,44 @@
+package slug
+
+import "testing"
+
+// goldenPairs are (display name → DNS-1123 slug) pairs verified BYTE-IDENTICAL
+// against the canonical Python slugify_dns1123 (backend common/utils/slug.py)
+// with a cross-language harness. They lock the NFKD transliteration — the one
+// place a Go port can silently drift from the backend validator that rejects
+// what this produces. If slug.py ever changes, re-run the harness and update
+// these rather than hand-editing.
+var goldenPairs = []struct{ name, want string }{
+	{"My Client", "my-client"},
+	{"café", "cafe"}, // canonical NFKD: é → e + accent (dropped)
+	{"CAFÉ", "cafe"}, // + lowercase
+	{"Müller GmbH", "muller-gmbh"},
+	{"Straße 42", "strae-42"}, // ß does not decompose → dropped (matches Python)
+	{"naïve", "naive"},
+	{"São Paulo", "sao-paulo"},
+	{"Zürich-Lab", "zurich-lab"},
+	{"北京 client", "client"},           // CJK dropped
+	{"client🚀rocket", "clientrocket"}, // emoji dropped
+	{"über_cool", "uber-cool"},
+	{"piñata", "pinata"},
+	{"Ω omega", "omega"},
+	{"ﬁ-ligature", "fi-ligature"},  // compat NFKD: ﬁ ligature → "fi"
+	{"①②③ circled", "123-circled"}, // circled digits → "123"
+	{"½ half", "12-half"},          // vulgar fraction → "1" + fraction-slash(dropped) + "2"
+	{"ＦＵＬＬ width", "full-width"},   // fullwidth → ASCII
+	{"trailing---dashes---", "trailing-dashes"},
+	{"UPPER MixED", "upper-mixed"},
+	{"a.b.c-d_e f", "a-b-c-d-e-f"},
+	{"2001: A Space Odyssey", "2001-a-space-odyssey"},
+	{"Ⅻ roman", "xii-roman"}, // roman numeral → "XII" → "xii"
+	{"²cubed³", "2cubed3"},   // super/subscripts → digits
+	{"Hello @ World #1", "hello-world-1"},
+}
+
+func TestSlugifyGoldenParity(t *testing.T) {
+	for _, p := range goldenPairs {
+		if got := Slugify(p.name); got != p.want {
+			t.Errorf("Slugify(%q) = %q, want %q (golden parity with slug.py)", p.name, got, p.want)
+		}
+	}
+}
diff --git a/internal/slug/slug_test.go b/internal/slug/slug_test.go
new file mode 100644
index 0000000..ac5c516
--- /dev/null
+++ b/internal/slug/slug_test.go
@@ -0,0 +1,45 @@
+package slug
+
+import (
+	"strings"
+	"testing"
+)
+
+func TestSlugify(t *testing.T) {
+	cases := map[string]string{
+		"My Client":             "my-client",
+		"café":                  "cafe", // NFKD transliteration, matches slug.py
+		"  spaces  ":            "spaces",
+		"UPPER_Case":            "upper-case",
+		"a--b":                  "a-b", // collapse consecutive dashes
+		"--trim--":              "trim",
+		"":                      "",
+		"!!!":                   "",
+		strings.Repeat("a", 70): strings.Repeat("a", 63), // 63-char cap
+	}
+	for in, want := range cases {
+		if got := Slugify(in); got != want {
+			t.Errorf("Slugify(%q) = %q, want %q", in, got, want)
+		}
+	}
+}
+
+func TestDerive(t *testing.T) {
+	if got, err := Derive("My Client", nil, ""); err != nil || got != "my-client" {
+		t.Errorf("no collision: got %q, err %v", got, err)
+	}
+	if got, _ := Derive("My Client", []string{"my-client"}, ""); got != "my-client-2" {
+		t.Errorf("collision: got %q, want my-client-2", got)
+	}
+	if got, _ := Derive("My Client", []string{"my-client", "my-client-2"}, ""); got != "my-client-3" {
+		t.Errorf("double collision: got %q, want my-client-3", got)
+	}
+	// CJK slugifies to empty → uses fallback
+	if got, _ := Derive("世界", nil, "client-abc"); got != "client-abc" {
+		t.Errorf("fallback: got %q, want client-abc", got)
+	}
+	// empty slug + no fallback → error (never a silent-empty namespace)
+	if _, err := Derive("!!!", nil, ""); err == nil {
+		t.Error("expected an error for empty slug with no fallback")
+	}
+}