diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index d1c8dba..d927706 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -14,8 +14,13 @@ on: permissions: contents: read +concurrency: + group: build-${{ github.ref }} + cancel-in-progress: ${{ github.event_name == 'pull_request' }} + jobs: schema-drift: + timeout-minutes: 10 name: Schema drift check # Verifies the embedded internal/schema/ingest.v1.json matches # tracebloc/data-ingestors' master. A green PR that silently @@ -30,6 +35,7 @@ jobs: run: ./scripts/sync-schema.sh --check test: + timeout-minutes: 15 name: Test runs-on: ubuntu-latest steps: @@ -52,6 +58,7 @@ jobs: run: go test -race -cover ./... lint: + timeout-minutes: 10 name: Lint runs-on: ubuntu-latest steps: @@ -103,6 +110,7 @@ jobs: misspell -error . build: + timeout-minutes: 20 name: Build (${{ matrix.os }}/${{ matrix.arch }}) runs-on: ubuntu-latest strategy: diff --git a/.github/workflows/e2e.yml b/.github/workflows/e2e.yml index 660921c..10a0beb 100644 --- a/.github/workflows/e2e.yml +++ b/.github/workflows/e2e.yml @@ -21,8 +21,13 @@ on: permissions: contents: read +concurrency: + group: e2e-${{ github.ref }} + cancel-in-progress: ${{ github.event_name == 'pull_request' }} + jobs: integration: + timeout-minutes: 30 name: Integration (kind) runs-on: ubuntu-latest # Skip on PRs that aren't explicitly opted in via the `e2e` label; diff --git a/.github/workflows/public-pii-gate-caller.yml b/.github/workflows/public-pii-gate-caller.yml new file mode 100644 index 0000000..4af6016 --- /dev/null +++ b/.github/workflows/public-pii-gate-caller.yml @@ -0,0 +1,14 @@ +name: Public PII gate + +# Per-repo caller for the public-repo PII gate. Blocks PRs whose title/body/ +# commits contain a denylisted customer/partner name or known secret. +# Logic lives in tracebloc/.github/.github/workflows/public-pii-gate.yml. + +on: + pull_request: + types: [opened, edited, reopened, synchronize, labeled, unlabeled] + +jobs: + pii-gate: + uses: tracebloc/.github/.github/workflows/public-pii-gate.yml@main + secrets: inherit diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 1468f1c..0a8f889 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -44,8 +44,13 @@ permissions: contents: write # create / update the GitHub Release id-token: write # cosign keyless OIDC +concurrency: + group: release-${{ github.ref }} + cancel-in-progress: ${{ github.event_name == 'pull_request' }} + jobs: release: + timeout-minutes: 20 name: Build + sign + publish runs-on: ubuntu-latest strategy: @@ -158,6 +163,7 @@ jobs: if-no-files-found: error publish: + timeout-minutes: 20 name: Aggregate + create GitHub Release runs-on: ubuntu-latest needs: release @@ -220,6 +226,7 @@ jobs: # add a HOMEBREW_TAP_TOKEN repo secret with write access to # tracebloc/homebrew-tap) when that repo exists. bump-homebrew-tap: + timeout-minutes: 20 name: Bump Homebrew tap formula runs-on: ubuntu-latest needs: publish diff --git a/internal/api/client.go b/internal/api/client.go new file mode 100644 index 0000000..f0e8187 --- /dev/null +++ b/internal/api/client.go @@ -0,0 +1,397 @@ +// Package api is the tracebloc CLI's client for the central backend REST API +// (browser login + client provisioning). It is distinct from internal/submit, +// which talks to the in-cluster jobs-manager: this one reaches the public +// backend over real TLS. RFC-0001 (backend#830); the device-flow endpoints +// (/device/code, /device/token) land in backend#835, so RequestDeviceCode / +// PollToken are written against the RFC 8628 spec and go live when that ships. +package api + +import ( + "bytes" + "context" + "encoding/json" + "errors" + "fmt" + "io" + "net/http" + "net/url" + "os" + "strings" + "time" +) + +// Backend environments (mirror CLIENT_ENV). +const ( + EnvDev = "dev" + EnvStg = "stg" + EnvProd = "prod" +) + +const defaultTimeout = 30 * time.Second + +// BaseURL maps a CLIENT_ENV value to the backend base URL — kept in lock-step +// with the installer's `_backend_url` and client-runtime's CLIENT_ENV→backend +// mapping. Unknown / empty → prod. +func BaseURL(env string) string { + switch strings.ToLower(env) { + case EnvDev: + return "https://dev-api.tracebloc.io" + case EnvStg: + return "https://stg-api.tracebloc.io" + default: + return "https://api.tracebloc.io" + } +} + +// ResolveEnv picks the backend env: an explicit value (a --env flag) wins, +// then $CLIENT_ENV, then prod. +func ResolveEnv(explicit string) string { + if explicit != "" { + return strings.ToLower(explicit) + } + if e := os.Getenv("CLIENT_ENV"); e != "" { + return strings.ToLower(e) + } + return EnvProd +} + +// Client talks to the backend REST API. Token (the user token from login) is +// optional: the device-flow endpoints are unauthenticated; provisioning calls +// set it. +type Client struct { + BaseURL string + Token string + HTTP *http.Client +} + +// New returns a Client for the given env. The HTTP client is proxy- and +// CA-aware — it honors HTTP(S)_PROXY/NO_PROXY and the system cert pool — +// because RFC-0001 must work behind a corporate / TLS-inspecting proxy +// (backend#830 Q1). Unlike internal/submit (in-cluster, no real TLS) this +// verifies certificates: it's the public backend. +func New(env string) *Client { + return &Client{ + BaseURL: strings.TrimRight(BaseURL(env), "/"), + HTTP: &http.Client{ + Timeout: defaultTimeout, + Transport: &http.Transport{Proxy: http.ProxyFromEnvironment}, + }, + } +} + +// APIError is a non-2xx response, with the remote body surfaced verbatim. +type APIError struct { + StatusCode int + Body string + URL string +} + +func (e *APIError) Error() string { + return fmt.Sprintf("%s returned HTTP %d: %s", e.URL, e.StatusCode, strings.TrimSpace(e.Body)) +} + +// post sends an optional JSON body and returns the status code + raw response. +func (c *Client) post(ctx context.Context, path string, body any) (int, []byte, error) { + var rdr io.Reader + if body != nil { + b, err := json.Marshal(body) + if err != nil { + return 0, nil, fmt.Errorf("encoding request: %w", err) + } + rdr = bytes.NewReader(b) + } + url := c.BaseURL + path + req, err := http.NewRequestWithContext(ctx, http.MethodPost, url, rdr) + if err != nil { + return 0, nil, fmt.Errorf("building request: %w", err) + } + req.Header.Set("Content-Type", "application/json") + c.setAuth(req) + resp, err := c.HTTP.Do(req) + if err != nil { + return 0, nil, fmt.Errorf("POST %s: %w", url, err) + } + defer func() { _ = resp.Body.Close() }() + raw, err := io.ReadAll(resp.Body) + if err != nil { + return resp.StatusCode, nil, fmt.Errorf("reading response from %s: %w", url, err) + } + return resp.StatusCode, raw, nil +} + +// setAuth attaches the stored token as a Bearer credential. The login token is +// a ClientAccessToken, authenticated by the backend's +// ClientAccessTokenAuthentication (keyword "Bearer", backend#835) — NOT the +// legacy DRF "Token" scheme. +func (c *Client) setAuth(req *http.Request) { + if c.Token != "" { + req.Header.Set("Authorization", "Bearer "+c.Token) + } +} + +// get sends an authenticated GET and returns the status code + raw response. +func (c *Client) get(ctx context.Context, path string) (int, []byte, error) { + url := c.BaseURL + path + req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) + if err != nil { + return 0, nil, fmt.Errorf("building request: %w", err) + } + c.setAuth(req) + resp, err := c.HTTP.Do(req) + if err != nil { + return 0, nil, fmt.Errorf("GET %s: %w", url, err) + } + defer func() { _ = resp.Body.Close() }() + raw, err := io.ReadAll(resp.Body) + if err != nil { + return resp.StatusCode, nil, fmt.Errorf("reading response from %s: %w", url, err) + } + return resp.StatusCode, raw, nil +} + +// ── Device Authorization Grant (RFC 8628) — backend endpoints land in #835 ── + +// DeviceCodeResponse is the reply from POST /device/code. +type DeviceCodeResponse struct { + DeviceCode string `json:"device_code"` + UserCode string `json:"user_code"` + VerificationURI string `json:"verification_uri"` + VerificationURIComplete string `json:"verification_uri_complete"` + ExpiresIn int `json:"expires_in"` + Interval int `json:"interval"` +} + +// RequestDeviceCode starts the device flow. +func (c *Client) RequestDeviceCode(ctx context.Context) (*DeviceCodeResponse, error) { + url := c.BaseURL + "/device/code" + status, raw, err := c.post(ctx, "/device/code", nil) + if err != nil { + return nil, err + } + if status < 200 || status >= 300 { + return nil, &APIError{StatusCode: status, Body: string(raw), URL: url} + } + var out DeviceCodeResponse + if err := json.Unmarshal(raw, &out); err != nil { + return nil, fmt.Errorf("decoding device-code response: %w", err) + } + if out.DeviceCode == "" || out.UserCode == "" { + return nil, fmt.Errorf("device-code response missing device_code/user_code (got %q)", string(raw)) + } + return &out, nil +} + +// Device-flow poll outcomes (RFC 8628 §3.5): pending / slow_down mean "keep +// polling"; expired_token / access_denied are terminal. +var ( + ErrAuthorizationPending = errors.New("authorization_pending") + ErrSlowDown = errors.New("slow_down") + ErrExpiredToken = errors.New("expired_token") + ErrAccessDenied = errors.New("access_denied") +) + +// PollToken polls POST /device/token once. It returns the user token on +// approval, or one of the Err* sentinels (pending/slow_down → keep polling, +// expired/denied → stop), or an *APIError for anything else. +func (c *Client) PollToken(ctx context.Context, deviceCode string) (string, error) { + url := c.BaseURL + "/device/token" + status, raw, err := c.post(ctx, "/device/token", map[string]string{"device_code": deviceCode}) + if err != nil { + return "", err + } + var body struct { + Token string `json:"token"` + Error string `json:"error"` + } + _ = json.Unmarshal(raw, &body) // best-effort; the status + error field drive the result + if status >= 200 && status < 300 { + if body.Token == "" { + return "", fmt.Errorf("device-token success response missing token (got %q)", string(raw)) + } + return body.Token, nil + } + switch body.Error { + case "authorization_pending": + return "", ErrAuthorizationPending + case "slow_down": + return "", ErrSlowDown + case "expired_token": + return "", ErrExpiredToken + case "access_denied": + return "", ErrAccessDenied + } + return "", &APIError{StatusCode: status, Body: string(raw), URL: url} +} + +// ── Authenticated calls (Bearer ClientAccessToken) ── + +// Identity is the signed-in user, from GET /userinfo/. +type Identity struct { + Email string `json:"email"` + Type string `json:"type"` + Account string `json:"account"` +} + +// WhoAmI fetches the signed-in user from the backend, authenticating with the +// stored token (Bearer). It confirms the token is live and returns the account +// — `login` uses it to verify the credential it just obtained. Requires Token. +func (c *Client) WhoAmI(ctx context.Context) (*Identity, error) { + url := c.BaseURL + "/userinfo/" + status, raw, err := c.get(ctx, "/userinfo/") + if err != nil { + return nil, err + } + if status < 200 || status >= 300 { + return nil, &APIError{StatusCode: status, Body: string(raw), URL: url} + } + var id Identity + if err := json.Unmarshal(raw, &id); err != nil { + return nil, fmt.Errorf("decoding userinfo response: %w", err) + } + return &id, nil +} + +// ── Client provisioning (Bearer-authed) — backend#836, /edge-device/ ── + +// ProvisionedClient is a tracebloc client (machine), as returned by the +// EdgeDevice endpoints. +type ProvisionedClient struct { + ID int `json:"id"` + Name string `json:"first_name"` + Username string `json:"username"` + Namespace string `json:"namespace"` + Location string `json:"location"` + Status int `json:"status"` + // ClusterID is the kube-system namespace UID this client is anchored to + // (RFC-0001 §6.3 / backend#883). Empty on legacy / not-yet-backfilled clients. + ClusterID string `json:"cluster_id"` +} + +// CreateClientRequest is the POST /edge-device/ body. The account is stamped +// server-side from the token; password is the machine credential the caller +// generates (write-only on the backend). +type CreateClientRequest struct { + Name string `json:"first_name"` + Namespace string `json:"namespace"` + Location string `json:"location"` + Password string `json:"password"` + // ClusterID anchors the client to this cluster (the kube-system namespace UID) + // so create is get-or-create keyed on it (RFC-0001 §7.2 / backend#883). Omitted + // when the cluster identity can't be read (dual-mode / legacy → plain mint). + ClusterID string `json:"cluster_id,omitempty"` +} + +// AdminContact is one "ask an admin" entry from GET /edge-device/admins/. +type AdminContact struct { + Name string `json:"name"` + Email string `json:"email"` +} + +// CreateClient provisions a client — get-or-create keyed on cluster_id when one +// is supplied (RFC-0001 §7.2 / backend#883). The returned `adopted` is true when +// the backend matched an existing client for this cluster (HTTP 200, an idempotent +// re-run) and false when it minted a new one (HTTP 201). A 403 *APIError means the +// caller lacks CLIENT_WRITE (→ ask-an-admin, backend#836 Q4); a 409 *APIError means +// the cluster is bound to another account (cluster_conflict, R6). +func (c *Client) CreateClient(ctx context.Context, req CreateClientRequest) (pc *ProvisionedClient, adopted bool, err error) { + url := c.BaseURL + "/edge-device/" + status, raw, err := c.post(ctx, "/edge-device/", req) + if err != nil { + return nil, false, err + } + if status < 200 || status >= 300 { + return nil, false, &APIError{StatusCode: status, Body: string(raw), URL: url} + } + var out ProvisionedClient + if err := json.Unmarshal(raw, &out); err != nil { + return nil, false, fmt.Errorf("decoding create-client response: %w", err) + } + // 200 = adopted an existing client for this cluster_id; 201 = freshly minted. + return &out, status == http.StatusOK, nil +} + +// maxListPages bounds how many pages ListClients will follow — a backstop +// against a misbehaving `next` chain, set well above any real account. +const maxListPages = 100 + +// ListClients returns ALL clients in the caller's account (GET /edge-device/). +// The endpoint is DRF-paginated, so this follows `next` to the end — list, +// `use `, and create-time collision detection must see every client, not +// just the first page. Also tolerates a bare (unpaginated) list body. +func (c *Client) ListClients(ctx context.Context) ([]ProvisionedClient, error) { + var all []ProvisionedClient + path := "/edge-device/" + for pageNum := 0; path != ""; pageNum++ { + if pageNum >= maxListPages { + return nil, fmt.Errorf("client list exceeded %d pages — aborting", maxListPages) + } + reqURL := c.BaseURL + path + status, raw, err := c.get(ctx, path) + if err != nil { + return nil, err + } + if status < 200 || status >= 300 { + return nil, &APIError{StatusCode: status, Body: string(raw), URL: reqURL} + } + // Unpaginated deployment → a bare array. Only valid as the sole response + // (a paginated chain is a `{next,results}` object on every page), so guard + // to page 0 — a stray bare body mid-chain must not silently end the loop. + if pageNum == 0 { + var bare []ProvisionedClient + if err := json.Unmarshal(raw, &bare); err == nil { + return bare, nil + } + } + var body struct { + Next string `json:"next"` + Results []ProvisionedClient `json:"results"` + } + if err := json.Unmarshal(raw, &body); err != nil { + return nil, fmt.Errorf("decoding client list: %w", err) + } + all = append(all, body.Results...) + next, perr := nextPath(body.Next) + if perr != nil { + return nil, perr + } + path = next + } + return all, nil +} + +// nextPath reduces a DRF `next` link (an absolute URL) to the path+query this +// client appends to BaseURL. An empty link returns ("", nil) — the normal +// end of pages. A non-empty link that won't parse is an error, NOT a silent +// "", so the loop never quietly stops mid-list and returns only the pages +// seen so far (list / `use` / collision checks must see every client). +func nextPath(next string) (string, error) { + if next == "" { + return "", nil + } + u, err := url.Parse(next) + if err != nil { + return "", fmt.Errorf("client list: unparseable pagination link %q: %w", next, err) + } + if u.RawQuery != "" { + return u.Path + "?" + u.RawQuery, nil + } + return u.Path, nil +} + +// ListClientAdmins returns who in the account can provision (the ask-an-admin +// path), from GET /edge-device/admins/ (backend#836 Q4). +func (c *Client) ListClientAdmins(ctx context.Context) ([]AdminContact, error) { + url := c.BaseURL + "/edge-device/admins/" + status, raw, err := c.get(ctx, "/edge-device/admins/") + if err != nil { + return nil, err + } + if status < 200 || status >= 300 { + return nil, &APIError{StatusCode: status, Body: string(raw), URL: url} + } + var out []AdminContact + if err := json.Unmarshal(raw, &out); err != nil { + return nil, fmt.Errorf("decoding admins response: %w", err) + } + return out, nil +} diff --git a/internal/api/client_test.go b/internal/api/client_test.go new file mode 100644 index 0000000..e091b79 --- /dev/null +++ b/internal/api/client_test.go @@ -0,0 +1,255 @@ +package api + +import ( + "context" + "encoding/json" + "errors" + "net/http" + "net/http/httptest" + "testing" +) + +func TestBaseURL(t *testing.T) { + cases := map[string]string{ + "dev": "https://dev-api.tracebloc.io", + "stg": "https://stg-api.tracebloc.io", + "prod": "https://api.tracebloc.io", + "": "https://api.tracebloc.io", + "DEV": "https://dev-api.tracebloc.io", // case-insensitive + "weird": "https://api.tracebloc.io", // unknown -> prod + } + for env, want := range cases { + if got := BaseURL(env); got != want { + t.Errorf("BaseURL(%q) = %q, want %q", env, got, want) + } + } +} + +func TestResolveEnv(t *testing.T) { + t.Setenv("CLIENT_ENV", "stg") + if got := ResolveEnv("dev"); got != "dev" { + t.Errorf("explicit should win: got %q", got) + } + if got := ResolveEnv(""); got != "stg" { + t.Errorf("CLIENT_ENV should be used: got %q", got) + } + t.Setenv("CLIENT_ENV", "") + if got := ResolveEnv(""); got != "prod" { + t.Errorf("default should be prod: got %q", got) + } +} + +func TestRequestDeviceCode(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/device/code" || r.Method != http.MethodPost { + t.Errorf("unexpected %s %s", r.Method, r.URL.Path) + } + _, _ = w.Write([]byte(`{"device_code":"dc","user_code":"WDJB-MJHT","verification_uri":"https://x/activate","expires_in":600,"interval":5}`)) + })) + defer srv.Close() + c := New("prod") + c.BaseURL = srv.URL + resp, err := c.RequestDeviceCode(context.Background()) + if err != nil { + t.Fatal(err) + } + if resp.UserCode != "WDJB-MJHT" || resp.Interval != 5 || resp.DeviceCode != "dc" { + t.Errorf("got %+v", resp) + } +} + +func TestPollTokenSequence(t *testing.T) { + var n int + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + n++ + switch n { + case 1: + w.WriteHeader(http.StatusBadRequest) + _, _ = w.Write([]byte(`{"error":"authorization_pending"}`)) + case 2: + w.WriteHeader(http.StatusBadRequest) + _, _ = w.Write([]byte(`{"error":"slow_down"}`)) + default: + _, _ = w.Write([]byte(`{"token":"usertoken123"}`)) + } + })) + defer srv.Close() + c := New("prod") + c.BaseURL = srv.URL + ctx := context.Background() + if _, err := c.PollToken(ctx, "dc"); !errors.Is(err, ErrAuthorizationPending) { + t.Errorf("poll 1: want pending, got %v", err) + } + if _, err := c.PollToken(ctx, "dc"); !errors.Is(err, ErrSlowDown) { + t.Errorf("poll 2: want slow_down, got %v", err) + } + tok, err := c.PollToken(ctx, "dc") + if err != nil || tok != "usertoken123" { + t.Errorf("poll 3: want token, got %q / %v", tok, err) + } +} + +func TestPollTokenDenied(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusBadRequest) + _, _ = w.Write([]byte(`{"error":"access_denied"}`)) + })) + defer srv.Close() + c := New("prod") + c.BaseURL = srv.URL + if _, err := c.PollToken(context.Background(), "dc"); !errors.Is(err, ErrAccessDenied) { + t.Errorf("want access_denied, got %v", err) + } +} + +func TestWhoAmI(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/userinfo/" || r.Method != http.MethodGet { + t.Errorf("unexpected %s %s", r.Method, r.URL.Path) + } + if got := r.Header.Get("Authorization"); got != "Bearer usertoken123" { + t.Errorf("auth header = %q, want %q", got, "Bearer usertoken123") + } + _, _ = w.Write([]byte(`{"email":"ds@tracebloc.io","type":"DS","account":"Acme"}`)) + })) + defer srv.Close() + c := New("prod") + c.BaseURL = srv.URL + c.Token = "usertoken123" + id, err := c.WhoAmI(context.Background()) + if err != nil { + t.Fatal(err) + } + if id.Email != "ds@tracebloc.io" || id.Account != "Acme" { + t.Errorf("got %+v", id) + } +} + +func TestWhoAmIUnauthorized(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusUnauthorized) + _, _ = w.Write([]byte(`{"detail":"Invalid token."}`)) + })) + defer srv.Close() + c := New("prod") + c.BaseURL = srv.URL + c.Token = "bad" + var ae *APIError + if _, err := c.WhoAmI(context.Background()); !errors.As(err, &ae) || ae.StatusCode != http.StatusUnauthorized { + t.Errorf("want APIError 401, got %v", err) + } +} + +func TestCreateClientMintAndAdopt(t *testing.T) { + for _, tc := range []struct { + name string + code int + wantAdopted bool + }{ + {"mint", http.StatusCreated, false}, + {"adopt", http.StatusOK, true}, + } { + t.Run(tc.name, func(t *testing.T) { + var sent CreateClientRequest + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/edge-device/" || r.Method != http.MethodPost { + t.Errorf("unexpected %s %s", r.Method, r.URL.Path) + } + _ = json.NewDecoder(r.Body).Decode(&sent) + w.WriteHeader(tc.code) + _, _ = w.Write([]byte(`{"id":5,"first_name":"c","namespace":"c","cluster_id":"uid-1"}`)) + })) + defer srv.Close() + c := New("prod") + c.BaseURL = srv.URL + pc, adopted, err := c.CreateClient(context.Background(), + CreateClientRequest{Name: "c", Namespace: "c", Password: "pw", ClusterID: "uid-1"}) + if err != nil { + t.Fatal(err) + } + if adopted != tc.wantAdopted { + t.Errorf("adopted = %v, want %v", adopted, tc.wantAdopted) + } + if sent.ClusterID != "uid-1" { + t.Errorf("cluster_id sent = %q, want uid-1", sent.ClusterID) + } + if pc.ClusterID != "uid-1" { + t.Errorf("cluster_id parsed = %q, want uid-1", pc.ClusterID) + } + }) + } +} + +func TestCreateClientConflict(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusConflict) + _, _ = w.Write([]byte(`{"error":"cluster_conflict","cluster_id":"uid-1"}`)) + })) + defer srv.Close() + c := New("prod") + c.BaseURL = srv.URL + var ae *APIError + _, _, err := c.CreateClient(context.Background(), CreateClientRequest{ClusterID: "uid-1"}) + if !errors.As(err, &ae) || ae.StatusCode != http.StatusConflict { + t.Errorf("want APIError 409, got %v", err) + } +} + +// TestListClients_FollowsPagination guards that DRF pagination is still +// followed end-to-end after the nextPath refactor (page 1 → page 2 → done). +func TestListClients_FollowsPagination(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Query().Get("page") == "2" { + _, _ = w.Write([]byte(`{"next":"","results":[{"id":2,"first_name":"b","namespace":"b"}]}`)) + return + } + // DRF emits an absolute `next`; nextPath keeps only path+query. + _, _ = w.Write([]byte(`{"next":"http://x/edge-device/?page=2","results":[{"id":1,"first_name":"a","namespace":"a"}]}`)) + })) + defer srv.Close() + c := New("prod") + c.BaseURL = srv.URL + got, err := c.ListClients(context.Background()) + if err != nil { + t.Fatal(err) + } + if len(got) != 2 || got[0].ID != 1 || got[1].ID != 2 { + t.Fatalf("want 2 clients [1,2], got %+v", got) + } +} + +// TestListClients_UnparseableNextLink_IsError pins the Bugbot fix (v0.4.0 RC): +// a non-empty `next` the server sends that url.Parse rejects must be a hard +// error, never a silent truncation to the pages seen so far — otherwise list / +// `use` / namespace-collision checks would miss clients without any error. +func TestListClients_UnparseableNextLink_IsError(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + // `next` carries a control byte (\u007f) → url.Parse fails. + _, _ = w.Write([]byte(`{"next":"http://x/\u007f","results":[{"id":1,"first_name":"a","namespace":"a"}]}`)) + })) + defer srv.Close() + c := New("prod") + c.BaseURL = srv.URL + if _, err := c.ListClients(context.Background()); err == nil { + t.Fatal("expected an error on an unparseable next link, got nil (silent truncation)") + } +} + +// TestListClients_BareArrayUnpaginated covers the unpaginated deployment shape +// (a bare JSON array) — still returned as-is after the bare-decode was guarded +// to the first page. +func TestListClients_BareArrayUnpaginated(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + _, _ = w.Write([]byte(`[{"id":1,"first_name":"a","namespace":"a"},{"id":2,"first_name":"b","namespace":"b"}]`)) + })) + defer srv.Close() + c := New("prod") + c.BaseURL = srv.URL + got, err := c.ListClients(context.Background()) + if err != nil { + t.Fatal(err) + } + if len(got) != 2 || got[0].ID != 1 || got[1].ID != 2 { + t.Fatalf("want 2 clients [1,2] from a bare array, got %+v", got) + } +} diff --git a/internal/cli/auth.go b/internal/cli/auth.go new file mode 100644 index 0000000..9176586 --- /dev/null +++ b/internal/cli/auth.go @@ -0,0 +1,212 @@ +package cli + +import ( + "context" + "errors" + "fmt" + "net/http" + "time" + + "github.com/spf13/cobra" + + "github.com/tracebloc/cli/internal/api" + "github.com/tracebloc/cli/internal/config" + "github.com/tracebloc/cli/internal/ui" +) + +// newLoginCmd implements `tracebloc login` — browser sign-in via the OAuth 2.0 +// Device Authorization Grant (RFC 8628). Works on a headless box: the CLI shows +// a URL + short code, the human approves in a browser on any device, and the +// CLI polls until a user token is issued and stored in ~/.tracebloc (0600). +// The backend endpoints land in backend#835; until then login reports that the +// backend doesn't support browser sign-in yet. +func newLoginCmd() *cobra.Command { + var envFlag string + cmd := &cobra.Command{ + Use: "login", + Short: "Sign in to tracebloc in your browser (device flow)", + Long: `Sign in to tracebloc. The CLI prints a URL + short code; open the URL +on any device (your laptop or phone), sign in the way you already do +(password, Google, or GitHub), and approve the code. The CLI stores a +user token in ~/.tracebloc (mode 0600). + +Works on a headless / SSH box — the browser and the CLI need not share a +machine. Honors HTTP(S)_PROXY / NO_PROXY for corporate-proxy networks.`, + Args: cobra.NoArgs, + RunE: func(cmd *cobra.Command, _ []string) error { + return runLogin(cmd.Context(), printerFor(cmd), envFlag) + }, + } + cmd.Flags().StringVar(&envFlag, "env", "", + "backend environment: dev|stg|prod (default: $CLIENT_ENV, then prod)") + return cmd +} + +// Test seams: the device flow makes real HTTP calls on a timer, so tests +// override the client factory (point it at an httptest server) and the poll +// clock (fire immediately) rather than hitting the network / wall clock. +var ( + newAPIClient = api.New + pollAfter = time.After +) + +func runLogin(ctx context.Context, p *ui.Printer, envFlag string) error { + cfg, err := config.Load() + if err != nil { + return &exitError{code: 1, err: err} + } + env := api.ResolveEnv(envFlag) + client := newAPIClient(env) + + dc, err := client.RequestDeviceCode(ctx) + if err != nil { + var ae *api.APIError + if errors.As(err, &ae) && ae.StatusCode == http.StatusNotFound { + return &exitError{code: 1, err: fmt.Errorf( + "this backend (%s) doesn't support browser login yet — the device-grant "+ + "endpoints land in backend#835: %w", env, err)} + } + return &exitError{code: 1, err: err} + } + + p.Section("Sign in to tracebloc") + uri := dc.VerificationURIComplete + if uri == "" { + uri = dc.VerificationURI + } + p.Field("open", uri) + p.Field("code", dc.UserCode) + p.Newline() + p.Hintf("Waiting for you to approve in the browser… (Ctrl-C to cancel)") + + interval := dc.Interval + if interval <= 0 { + interval = 5 + } + var deadline time.Time + if dc.ExpiresIn > 0 { + deadline = time.Now().Add(time.Duration(dc.ExpiresIn) * time.Second) + } + + for { + if !deadline.IsZero() && time.Now().After(deadline) { + return &exitError{code: 1, err: errors.New("login timed out — re-run `tracebloc login`")} + } + select { + case <-ctx.Done(): + return ctx.Err() + case <-pollAfter(time.Duration(interval) * time.Second): + } + + tok, err := client.PollToken(ctx, dc.DeviceCode) + switch { + case err == nil: + cfg.Env = env + cfg.Token = tok + // Confirm the freshly-issued token actually authenticates, and + // capture the account to show + store. Best-effort: don't fail a + // successful sign-in just because this lookup couldn't run. + client.Token = tok + if id, werr := client.WhoAmI(ctx); werr == nil { + cfg.Email = id.Email + } + if err := cfg.Save(); err != nil { + return &exitError{code: 1, err: err} + } + p.Newline() + if cfg.Email != "" { + p.Successf("Signed in as %s. Token saved to ~/.tracebloc (0600).", cfg.Email) + } else { + p.Successf("Signed in. Token saved to ~/.tracebloc (0600).") + } + return nil + case errors.Is(err, api.ErrAuthorizationPending): + // not approved yet — keep polling + case errors.Is(err, api.ErrSlowDown): + // RFC 8628 §3.5: on slow_down the client MUST increase the poll + // interval by 5 seconds for this and all subsequent polls. + interval += 5 + case errors.Is(err, api.ErrExpiredToken): + return &exitError{code: 1, err: errors.New("the sign-in code expired — re-run `tracebloc login`")} + case errors.Is(err, api.ErrAccessDenied): + return &exitError{code: 1, err: errors.New("sign-in was denied in the browser")} + default: + return &exitError{code: 1, err: err} + } + } +} + +// newLogoutCmd implements `tracebloc logout` — clears the stored token. +func newLogoutCmd() *cobra.Command { + return &cobra.Command{ + Use: "logout", + Short: "Sign out (clear the stored token)", + Args: cobra.NoArgs, + RunE: func(cmd *cobra.Command, _ []string) error { + cfg, err := config.Load() + if err != nil { + return &exitError{code: 1, err: err} + } + if !cfg.SignedIn() { + printerFor(cmd).Hintf("Already signed out.") + return nil + } + cfg.Token = "" + cfg.Email = "" + // Also drop the active-client pointer: it's account-scoped, so leaving + // it would bleed into the next account's session (a later `login` as a + // different user would inherit a stale active client in `auth status` / + // `client list`). + cfg.ActiveClientID = "" + if err := cfg.Save(); err != nil { + return &exitError{code: 1, err: err} + } + printerFor(cmd).Successf("Signed out.") + return nil + }, + } +} + +// newAuthCmd is the `tracebloc auth` parent; today it carries `auth status`. +func newAuthCmd() *cobra.Command { + cmd := &cobra.Command{ + Use: "auth", + Short: "Inspect tracebloc authentication state", + } + cmd.AddCommand(newAuthStatusCmd()) + return cmd +} + +// newAuthStatusCmd implements `tracebloc auth status`. +func newAuthStatusCmd() *cobra.Command { + return &cobra.Command{ + Use: "status", + Short: "Show whether you're signed in, and to which backend", + Args: cobra.NoArgs, + RunE: func(cmd *cobra.Command, _ []string) error { + cfg, err := config.Load() + if err != nil { + return &exitError{code: 1, err: err} + } + p := printerFor(cmd) + if !cfg.SignedIn() { + p.Hintf("Not signed in. Run `tracebloc login`.") + return nil + } + env := cfg.Env + if env == "" { + env = api.EnvProd + } + p.Section("tracebloc auth") + p.Field("status", "signed in") + p.Field("backend", env) + if cfg.Email != "" { + p.Field("account", cfg.Email) + } + if cfg.ActiveClientID != "" { + p.Field("active client", cfg.ActiveClientID) + } + return nil + }, + } +} diff --git a/internal/cli/auth_test.go b/internal/cli/auth_test.go new file mode 100644 index 0000000..d17ce6e --- /dev/null +++ b/internal/cli/auth_test.go @@ -0,0 +1,217 @@ +package cli + +import ( + "bytes" + "net/http" + "net/http/httptest" + "strings" + "testing" + "time" + + "github.com/tracebloc/cli/internal/api" + "github.com/tracebloc/cli/internal/config" +) + +// withTestBackend points the login command at an httptest server (via the +// newAPIClient seam), makes polling instant (pollAfter seam), and isolates the +// on-disk config to a temp dir. All are restored on cleanup. +func withTestBackend(t *testing.T, h http.HandlerFunc) { + t.Helper() + srv := httptest.NewServer(h) + t.Cleanup(srv.Close) + t.Setenv("TRACEBLOC_CONFIG_DIR", t.TempDir()) + + origClient, origAfter := newAPIClient, pollAfter + newAPIClient = func(string) *api.Client { + return &api.Client{BaseURL: srv.URL, HTTP: srv.Client()} + } + pollAfter = func(time.Duration) <-chan time.Time { + ch := make(chan time.Time, 1) + ch <- time.Time{} + return ch + } + t.Cleanup(func() { newAPIClient = origClient; pollAfter = origAfter }) +} + +func runCmd(t *testing.T, args ...string) (string, error) { + t.Helper() + root := NewRootCmd(BuildInfo{Version: "test"}) + var out bytes.Buffer + root.SetOut(&out) + root.SetErr(&out) + root.SetArgs(args) + err := root.Execute() + return out.String(), err +} + +func TestLogin_FullFlow(t *testing.T) { + var polls int + withTestBackend(t, func(w http.ResponseWriter, r *http.Request) { + switch r.URL.Path { + case "/device/code": + _, _ = w.Write([]byte(`{"device_code":"dc","user_code":"WDJB-MJHT","verification_uri":"https://x/activate","expires_in":600,"interval":5}`)) + case "/device/token": + polls++ + if polls == 1 { + w.WriteHeader(http.StatusBadRequest) + _, _ = w.Write([]byte(`{"error":"authorization_pending"}`)) + return + } + _, _ = w.Write([]byte(`{"token":"cat_abc"}`)) + case "/userinfo/": + if got := r.Header.Get("Authorization"); got != "Bearer cat_abc" { + t.Errorf("userinfo auth header = %q, want %q", got, "Bearer cat_abc") + } + _, _ = w.Write([]byte(`{"email":"ds@tracebloc.io","account":"Acme"}`)) + default: + t.Errorf("unexpected request path %s", r.URL.Path) + } + }) + + out, err := runCmd(t, "login") + if err != nil { + t.Fatalf("login: %v", err) + } + if polls != 2 { + t.Errorf("expected 2 polls (pending then token), got %d", polls) + } + cfg, _ := config.Load() + if cfg.Token != "cat_abc" { + t.Errorf("stored token = %q, want cat_abc", cfg.Token) + } + if cfg.Email != "ds@tracebloc.io" { + t.Errorf("stored email = %q, want ds@tracebloc.io", cfg.Email) + } + if !strings.Contains(out, "ds@tracebloc.io") { + t.Errorf("expected output to show the account, got:\n%s", out) + } +} + +// TestLogin_SlowDownBacksOffByFive pins RFC 8628 §3.5: on `slow_down` the poll +// interval must increase by 5 seconds, not 1. Captures the durations handed to +// the pollAfter seam. +func TestLogin_SlowDownBacksOffByFive(t *testing.T) { + t.Setenv("TRACEBLOC_CONFIG_DIR", t.TempDir()) + var polls int + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + switch r.URL.Path { + case "/device/code": + _, _ = w.Write([]byte(`{"device_code":"dc","user_code":"X","verification_uri":"https://x/activate","expires_in":600,"interval":5}`)) + case "/device/token": + polls++ + if polls == 1 { + w.WriteHeader(http.StatusBadRequest) + _, _ = w.Write([]byte(`{"error":"slow_down"}`)) + return + } + _, _ = w.Write([]byte(`{"token":"cat_ok"}`)) + case "/userinfo/": + _, _ = w.Write([]byte(`{"email":"e@co","account":"A"}`)) + } + })) + t.Cleanup(srv.Close) + + origClient, origAfter := newAPIClient, pollAfter + newAPIClient = func(string) *api.Client { return &api.Client{BaseURL: srv.URL, HTTP: srv.Client()} } + var waits []time.Duration + pollAfter = func(d time.Duration) <-chan time.Time { + waits = append(waits, d) + ch := make(chan time.Time, 1) + ch <- time.Time{} + return ch + } + t.Cleanup(func() { newAPIClient = origClient; pollAfter = origAfter }) + + if _, err := runCmd(t, "login"); err != nil { + t.Fatalf("login: %v", err) + } + if len(waits) < 2 { + t.Fatalf("expected >=2 polls, got waits=%v", waits) + } + if waits[0] != 5*time.Second { + t.Errorf("first poll wait = %v, want 5s (server interval)", waits[0]) + } + if waits[1] != 10*time.Second { + t.Errorf("post-slow_down wait = %v, want 10s (interval+5 per RFC 8628), not 6s", waits[1]) + } +} + +func TestLogin_BackendUnsupported(t *testing.T) { + withTestBackend(t, func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusNotFound) + }) + _, err := runCmd(t, "login") + if err == nil || !strings.Contains(err.Error(), "doesn't support browser login") { + t.Errorf("want unsupported-backend error, got %v", err) + } + cfg, _ := config.Load() + if cfg.SignedIn() { + t.Error("must not store a token when the backend has no device endpoints") + } +} + +func TestLogin_Denied(t *testing.T) { + withTestBackend(t, func(w http.ResponseWriter, r *http.Request) { + switch r.URL.Path { + case "/device/code": + _, _ = w.Write([]byte(`{"device_code":"dc","user_code":"X","interval":5}`)) + default: + w.WriteHeader(http.StatusBadRequest) + _, _ = w.Write([]byte(`{"error":"access_denied"}`)) + } + }) + _, err := runCmd(t, "login") + if err == nil || !strings.Contains(err.Error(), "denied") { + t.Errorf("want access-denied error, got %v", err) + } +} + +func TestLogout(t *testing.T) { + t.Setenv("TRACEBLOC_CONFIG_DIR", t.TempDir()) + if err := (&config.Config{Token: "x", Email: "e@co", ActiveClientID: "7"}).Save(); err != nil { + t.Fatal(err) + } + out, err := runCmd(t, "logout") + if err != nil { + t.Fatal(err) + } + cfg, _ := config.Load() + if cfg.SignedIn() { + t.Error("expected to be signed out") + } + // The active-client pointer is account-scoped — it must not survive logout, + // or it bleeds into the next account's session. + if cfg.ActiveClientID != "" { + t.Errorf("active_client_id = %q after logout, want cleared", cfg.ActiveClientID) + } + if !strings.Contains(out, "Signed out") { + t.Errorf("got:\n%s", out) + } +} + +func TestAuthStatus_SignedIn(t *testing.T) { + t.Setenv("TRACEBLOC_CONFIG_DIR", t.TempDir()) + if err := (&config.Config{Env: "dev", Token: "x", Email: "ds@co"}).Save(); err != nil { + t.Fatal(err) + } + out, err := runCmd(t, "auth", "status") + if err != nil { + t.Fatal(err) + } + for _, want := range []string{"signed in", "ds@co", "dev"} { + if !strings.Contains(out, want) { + t.Errorf("status output missing %q, got:\n%s", want, out) + } + } +} + +func TestAuthStatus_NotSignedIn(t *testing.T) { + t.Setenv("TRACEBLOC_CONFIG_DIR", t.TempDir()) + out, err := runCmd(t, "auth", "status") + if err != nil { + t.Fatal(err) + } + if !strings.Contains(out, "Not signed in") { + t.Errorf("got:\n%s", out) + } +} diff --git a/internal/cli/client.go b/internal/cli/client.go new file mode 100644 index 0000000..328bbc3 --- /dev/null +++ b/internal/cli/client.go @@ -0,0 +1,439 @@ +package cli + +import ( + "context" + "crypto/rand" + "encoding/hex" + "errors" + "fmt" + "net/http" + "os" + "path/filepath" + "strconv" + "strings" + + "github.com/spf13/cobra" + + "github.com/tracebloc/cli/internal/api" + "github.com/tracebloc/cli/internal/cluster" + "github.com/tracebloc/cli/internal/config" + "github.com/tracebloc/cli/internal/slug" + "github.com/tracebloc/cli/internal/ui" +) + +// readClusterID reads the cluster's kube-system UID — the RFC-0001 idempotency +// anchor (§7.2 / backend#883). A package var so tests can stub it without a +// reachable cluster. +var readClusterID = cluster.ClusterID + +// newClientCmd wires the `tracebloc client` subtree — provisioning + selecting +// the client (machine) this host enrolls as. Consumes the backend provisioning +// endpoints (backend#836) with the user token from `tracebloc login`. +func newClientCmd() *cobra.Command { + cmd := &cobra.Command{ + Use: "client", + Short: "Provision and manage tracebloc clients (machines)", + Long: `Provision a tracebloc client for this machine and list/select clients +in your account. Requires sign-in first (` + "`tracebloc login`" + `).`, + } + cmd.AddCommand(newClientCreateCmd(), newClientListCmd(), newClientUseCmd()) + return cmd +} + +func newClientCreateCmd() *cobra.Command { + var name, location, kubeconfigPath, contextOverride, credentialFile string + var yes bool + cmd := &cobra.Command{ + Use: "create", + Short: "Provision a new client for this machine (--name, --location)", + Args: cobra.NoArgs, + RunE: func(cmd *cobra.Command, _ []string) error { + return runClientCreate(cmd.Context(), printerFor(cmd), clientPrompter(), + clientCreateOpts{name: name, location: location, kubeconfigPath: kubeconfigPath, contextOverride: contextOverride, credentialFile: credentialFile, yes: yes}) + }, + } + cmd.Flags().StringVar(&name, "name", "", + "human-readable client name (shown on your dashboard + carbon reports)") + cmd.Flags().StringVar(&location, "location", "", + "location zone for carbon footprint (e.g. DE); prompted if omitted") + cmd.Flags().StringVar(&kubeconfigPath, "kubeconfig", "", + "path to the kubeconfig for the target cluster (default: $KUBECONFIG, then ~/.kube/config) — read to anchor the client to this cluster") + cmd.Flags().StringVar(&contextOverride, "context", "", + "kubeconfig context for the target cluster (default: current-context)") + cmd.Flags().StringVar(&credentialFile, "credential-file", "", + "write the machine credential to this path (mode 0600, sourceable env) instead of printing it — for the installer to feed the chart (never shown on the terminal)") + cmd.Flags().BoolVar(&yes, "yes", false, "skip the confirmation prompt") + return cmd +} + +// clientCreateOpts bundles the `client create` inputs (flags + resolved prompts). +type clientCreateOpts struct { + name, location, kubeconfigPath, contextOverride, credentialFile string + yes bool +} + +func newClientListCmd() *cobra.Command { + return &cobra.Command{ + Use: "list", + Aliases: []string{"ls"}, + Short: "List the clients in your account", + Args: cobra.NoArgs, + RunE: func(cmd *cobra.Command, _ []string) error { + return runClientList(cmd.Context(), printerFor(cmd)) + }, + } +} + +func newClientUseCmd() *cobra.Command { + return &cobra.Command{ + Use: "use ", + Short: "Enroll this machine as an existing client", + Args: cobra.ExactArgs(1), + RunE: func(cmd *cobra.Command, args []string) error { + return runClientUse(cmd.Context(), printerFor(cmd), args[0]) + }, + } +} + +// clientPrompter returns the interactive prompter on a TTY, else nil (so +// commands fall back to flags-only and never block on a pipe / in CI). +func clientPrompter() prompter { + if isInteractiveTTY() { + return surveyPrompter{} + } + return nil +} + +// authedClient loads the signed-in config and returns a token-bearing API +// client, or an error telling the user to log in. +func authedClient() (*api.Client, *config.Config, error) { + cfg, err := config.Load() + if err != nil { + return nil, nil, err + } + if !cfg.SignedIn() { + return nil, nil, errors.New("not signed in — run `tracebloc login` first") + } + env := cfg.Env + if env == "" { + env = api.ResolveEnv("") + } + client := newAPIClient(env) + client.Token = cfg.Token + return client, cfg, nil +} + +func runClientCreate(ctx context.Context, p *ui.Printer, pr prompter, opts clientCreateOpts) error { + client, cfg, err := authedClient() + if err != nil { + return &exitError{code: 1, err: err} + } + + name, location := opts.name, opts.location + + // Gather inputs first (flags win; prompt only what's missing, and only on a + // TTY), then show one review + confirm — matching the dataset-push flow. + if name == "" { + if pr == nil { + return errMissingFlag("--name") + } + if name, err = pr.Input("Client name", "shown on your dashboard + carbon reports", "", validateNonEmpty); err != nil { + return mapClientErr(err) + } + } + if location == "" { + if pr == nil { + return errMissingFlag("--location") + } + // Never silent-empty: the prompt requires a non-empty zone. (Cloud / + // GeoIP auto-detect of a suggested default is a fast-follow.) + if location, err = pr.Input("Location zone (e.g. DE)", "physical zone, for the carbon footprint", "", validateNonEmpty); err != nil { + return mapClientErr(err) + } + } + + // Read the cluster anchor (kube-system UID) so create is get-or-create keyed on + // it — re-running on the same cluster adopts the existing client instead of + // minting a duplicate (RFC-0001 §7.2 / backend#883). Best-effort + never silent: + // if the cluster isn't reachable we provision WITHOUT an anchor (a plain mint) + // and say so, rather than blocking. + clusterID, cidErr := readClusterID(ctx, cluster.KubeconfigOptions{Path: opts.kubeconfigPath, Context: opts.contextOverride}) + if cidErr != nil { + p.Hintf("Couldn't read the target cluster's identity — provisioning without a cluster anchor, so re-running won't be idempotent. Point --kubeconfig/--context at the reachable cluster to enable that.") + } + + // Derive the namespace slug from the name, avoiding collisions with existing + // clients (best-effort: if the list call fails we still derive a base slug). + var existing []string + if clients, lerr := client.ListClients(ctx); lerr == nil { + for _, c := range clients { + // Skip the client already anchored to this cluster: a re-run adopts it + // (the backend keys on cluster_id), so its namespace isn't a collision. + // Counting it would bump the derived slug and show a namespace in the + // review that doesn't match the one actually adopted. + if clusterID != "" && c.ClusterID == clusterID { + continue + } + if c.Namespace != "" { + existing = append(existing, c.Namespace) + } + } + } + namespace, err := slug.Derive(name, existing, "client-"+randHex(4)) + if err != nil { + return &exitError{code: 1, err: err} + } + + if pr != nil && !opts.yes { + renderClientReview(p, name, namespace, location, clusterID) + ok, cerr := pr.Confirm("Provision this client?", true) + if cerr != nil { + return mapClientErr(cerr) + } + if !ok { + p.Hintf("Cancelled.") + return nil + } + } + + // The machine credential: the CLI generates the password, the backend stores + // it (write-only). Sent on every create but used only when minting — on an + // idempotent adopt the backend keeps the existing client's credential (§7.2), + // so the generated value is never printed in that case. + password := randHex(24) + pc, adopted, err := client.CreateClient(ctx, api.CreateClientRequest{ + Name: name, + Namespace: namespace, + Location: location, + Password: password, + ClusterID: clusterID, + }) + if err != nil { + var ae *api.APIError + if errors.As(err, &ae) { + switch ae.StatusCode { + case http.StatusForbidden: + return askAnAdmin(ctx, p, client) + case http.StatusConflict: + // Per RFC C.3 the only 409 on POST /edge-device/ is cluster_conflict + // (R6): this cluster_id is bound to another account. + return &exitError{code: 1, err: errors.New( + "this cluster is already registered to a different tracebloc account — " + + "sign in to that account, or ask your admin (cluster_conflict)")} + } + } + return &exitError{code: 1, err: err} + } + + cfg.ActiveClientID = strconv.Itoa(pc.ID) + + p.Newline() + if adopted { + // Idempotent re-run: the backend matched this cluster_id to an existing + // client and returned it — no new credential. (The existing-fleet R7 case, + // where the backend instead matches a live in-cluster TB_CLIENT_ID whose + // cluster_id is still null and the CLI backfills it via PATCH, is the + // installer's orchestration — #838 — not done here.) + p.Successf("This cluster is already registered as client %q (namespace %s) — adopted it.", pc.Name, pc.Namespace) + p.Hintf("No new credential issued; the existing one stands. This machine is set to enroll as client %d.", pc.ID) + if opts.credentialFile != "" { + // No password to hand over on adopt (it's write-only on the backend and + // the existing one stands). Emit id + namespace + an ADOPTED marker so the + // installer reconciles the existing release rather than expecting a fresh + // credential (#838). + if werr := writeClientCredential(opts.credentialFile, []string{ + "TRACEBLOC_CLIENT_ID=" + strconv.Itoa(pc.ID), + "TB_NAMESPACE=" + pc.Namespace, + "TRACEBLOC_CLIENT_ADOPTED=1", + }); werr != nil { + return &exitError{code: 1, err: werr} + } + p.Hintf("Wrote client id + namespace to %s (no new credential — the existing one stands).", opts.credentialFile) + } + // Mirror the mint path: a config-save failure shouldn't bury the result — + // hint how to set the pointer by hand and still exit clean. + if serr := cfg.Save(); serr != nil { + p.Hintf("Couldn't save the active-client pointer (%v) — run `tracebloc client use %d` to set it.", serr, pc.ID) + } + return nil + } + // Mint. With --credential-file the secret goes to a 0600 file (never the + // terminal — RFC §9 "secure by invisibility") for the installer to source; + // otherwise it's printed (the interim, until the installer drives this). + p.Successf("Provisioned client %q (namespace %s).", pc.Name, pc.Namespace) + if opts.credentialFile != "" { + if werr := writeClientCredential(opts.credentialFile, []string{ + "TRACEBLOC_CLIENT_ID=" + strconv.Itoa(pc.ID), + "TRACEBLOC_CLIENT_PASSWORD=" + password, + "TB_NAMESPACE=" + pc.Namespace, + }); werr != nil { + // The credential is the only copy — a write failure must be fatal, not a + // silent drop (the installer would have nothing to connect with). + return &exitError{code: 1, err: werr} + } + p.Hintf("Credential written to %s (mode 0600, not shown). This machine is set to enroll as client %d.", opts.credentialFile, pc.ID) + } else { + // Print the credential FIRST — it's the only copy (the backend stores only + // the hash), so a later config-save failure must never cost it. + p.Section("Machine credential — needed by the installer to connect this client") + p.Field("client id", strconv.Itoa(pc.ID)) + p.Field("username", pc.Username) + p.Field("password", password) + } + if serr := cfg.Save(); serr != nil { + p.Hintf("Couldn't save the active-client pointer (%v) — run `tracebloc client use %d` to set it.", serr, pc.ID) + } + return nil +} + +// writeClientCredential writes the machine credential to path (mode 0600) as a +// shell-sourceable env file — the installer (#838) sources it to feed the chart, +// so the secret lands in a 0600 file, never the terminal (RFC §9 never-show). The +// values are constrained charsets (numeric id, hex password, DNS-1123 slug), so +// no shell-escaping is needed. +// +// Written via a 0600 temp file + atomic rename rather than os.WriteFile: WriteFile +// only applies its perm bits when it *creates* the file, so a pre-existing target +// (a stale file, or one an attacker pre-creates world-readable) would keep its old +// mode and leak the secret — the 0600 guarantee must hold unconditionally. The temp +// also avoids following a symlink at the target and never leaves a half-written +// credential behind. +func writeClientCredential(path string, lines []string) error { + dir := filepath.Dir(path) + if dir != "" && dir != "." { + if err := os.MkdirAll(dir, 0o700); err != nil { + return fmt.Errorf("creating credential-file directory: %w", err) + } + } + body := "# tracebloc client credential — written by `tracebloc client create`.\n" + + "# Mode 0600; sourced by the installer. Do not commit or share.\n" + + strings.Join(lines, "\n") + "\n" + // CreateTemp makes the file 0600 by construction, in the target dir so the + // rename stays on one filesystem. + f, err := os.CreateTemp(dir, ".cred-*") + if err != nil { + return fmt.Errorf("writing credential file %s: %w", path, err) + } + tmp := f.Name() + if _, err := f.WriteString(body); err != nil { + _ = f.Close() + _ = os.Remove(tmp) + return fmt.Errorf("writing credential file %s: %w", path, err) + } + if err := f.Close(); err != nil { + _ = os.Remove(tmp) + return fmt.Errorf("writing credential file %s: %w", path, err) + } + if err := os.Rename(tmp, path); err != nil { + _ = os.Remove(tmp) + return fmt.Errorf("writing credential file %s: %w", path, err) + } + return nil +} + +// askAnAdmin renders the "you can't provision — here's who can" path (a 403 from +// the backend means no CLIENT_WRITE; backend#836 Q4). +func askAnAdmin(ctx context.Context, p *ui.Printer, client *api.Client) error { + p.Newline() + p.Hintf("You don't have permission to provision a client in this account.") + if admins, err := client.ListClientAdmins(ctx); err == nil && len(admins) > 0 { + p.Section("Ask one of these admins to provision it (or grant you access)") + for _, a := range admins { + label := a.Name + if label == "" { + label = a.Email + } + p.Field(label, a.Email) + } + } + return &exitError{code: 1, err: errors.New("provisioning requires CLIENT_WRITE permission")} +} + +func runClientList(ctx context.Context, p *ui.Printer) error { + client, cfg, err := authedClient() + if err != nil { + return &exitError{code: 1, err: err} + } + clients, err := client.ListClients(ctx) + if err != nil { + return &exitError{code: 1, err: err} + } + if len(clients) == 0 { + p.Hintf("No clients yet. Run `tracebloc client create`.") + return nil + } + p.Section("Clients in your account") + for _, c := range clients { + marker := "" + if strconv.Itoa(c.ID) == cfg.ActiveClientID { + marker = " (active)" + } + p.Field(strconv.Itoa(c.ID)+marker, + fmt.Sprintf("%s namespace=%s location=%s", c.Name, c.Namespace, c.Location)) + } + return nil +} + +func runClientUse(ctx context.Context, p *ui.Printer, id string) error { + client, cfg, err := authedClient() + if err != nil { + return &exitError{code: 1, err: err} + } + clients, err := client.ListClients(ctx) + if err != nil { + return &exitError{code: 1, err: err} + } + for _, c := range clients { + if strconv.Itoa(c.ID) == id { + cfg.ActiveClientID = id + if serr := cfg.Save(); serr != nil { + return &exitError{code: 1, err: serr} + } + p.Successf("This machine is now set to enroll as client %s (%s).", id, c.Name) + return nil + } + } + return &exitError{code: 1, err: fmt.Errorf( + "no client %s in your account — run `tracebloc client list` to see the ids", id)} +} + +// renderClientReview shows the assembled inputs before the confirm prompt, so +// the user sees the derived namespace and location before anything is created. +func renderClientReview(p *ui.Printer, name, namespace, location, clusterID string) { + p.Section("Review") + p.Field("name", name) + p.Field("namespace", namespace) + p.Field("location", location) + if clusterID != "" { + p.Field("cluster", clusterID+" (anchors this client — re-runs adopt it)") + } +} + +// errMissingFlag reports a required flag absent in a non-interactive run (no TTY +// to prompt — CI, a pipe, or output redirected). +func errMissingFlag(flag string) error { + return &exitError{code: 1, err: fmt.Errorf("%s is required (non-interactive — no TTY to prompt)", flag)} +} + +// validateNonEmpty rejects blank prompt input. +func validateNonEmpty(s string) error { + if strings.TrimSpace(s) == "" { + return errors.New("required") + } + return nil +} + +// mapClientErr turns a cancelled interactive prompt into a clean exit. +func mapClientErr(err error) error { + if errors.Is(err, errInteractiveCancelled) { + return nil + } + return &exitError{code: 1, err: err} +} + +// randHex returns nbytes of crypto-random data hex-encoded. +func randHex(nbytes int) string { + b := make([]byte, nbytes) + _, _ = rand.Read(b) // crypto/rand.Read does not fail on a healthy system + return hex.EncodeToString(b) +} diff --git a/internal/cli/client_test.go b/internal/cli/client_test.go new file mode 100644 index 0000000..dcdcc1c --- /dev/null +++ b/internal/cli/client_test.go @@ -0,0 +1,551 @@ +package cli + +import ( + "bytes" + "context" + "encoding/json" + "errors" + "fmt" + "net/http" + "net/http/httptest" + "os" + "path/filepath" + "strings" + "testing" + + "github.com/tracebloc/cli/internal/api" + "github.com/tracebloc/cli/internal/cluster" + "github.com/tracebloc/cli/internal/config" + "github.com/tracebloc/cli/internal/ui" +) + +// withClientBackend points the client commands at an httptest server (via the +// newAPIClient seam) and writes a signed-in config to a temp dir. It also stubs +// readClusterID to "no cluster" by default, so create tests never touch a real +// kubeconfig/cluster — tests that exercise the anchor override it via stubClusterID. +func withClientBackend(t *testing.T, h http.HandlerFunc) { + t.Helper() + srv := httptest.NewServer(h) + t.Cleanup(srv.Close) + t.Setenv("TRACEBLOC_CONFIG_DIR", t.TempDir()) + if err := (&config.Config{Env: "dev", Token: "tok"}).Save(); err != nil { + t.Fatal(err) + } + orig := newAPIClient + newAPIClient = func(string) *api.Client { + return &api.Client{BaseURL: srv.URL, HTTP: srv.Client()} + } + t.Cleanup(func() { newAPIClient = orig }) + + origCID := readClusterID + readClusterID = func(context.Context, cluster.KubeconfigOptions) (string, error) { + return "", errors.New("no cluster reachable (test default)") + } + t.Cleanup(func() { readClusterID = origCID }) +} + +// stubClusterID overrides the cluster-anchor read for a single test. +func stubClusterID(t *testing.T, uid string, err error) { + t.Helper() + orig := readClusterID + readClusterID = func(context.Context, cluster.KubeconfigOptions) (string, error) { + return uid, err + } + t.Cleanup(func() { readClusterID = orig }) +} + +func TestClientCreate_Success(t *testing.T) { + var body api.CreateClientRequest + withClientBackend(t, func(w http.ResponseWriter, r *http.Request) { + switch { + case r.Method == http.MethodGet && r.URL.Path == "/edge-device/": + _, _ = w.Write([]byte(`[]`)) // no existing clients + case r.Method == http.MethodPost && r.URL.Path == "/edge-device/": + if got := r.Header.Get("Authorization"); got != "Bearer tok" { + t.Errorf("auth header = %q, want Bearer tok", got) + } + _ = json.NewDecoder(r.Body).Decode(&body) + w.WriteHeader(http.StatusCreated) + _, _ = w.Write([]byte(`{"id":5,"first_name":"my-client","username":"u-123","namespace":"my-client","location":"DE"}`)) + default: + t.Errorf("unexpected %s %s", r.Method, r.URL.Path) + } + }) + var out bytes.Buffer + if err := runClientCreate(context.Background(), ui.New(&out), nil, clientCreateOpts{name: "my-client", location: "DE", yes: true}); err != nil { + t.Fatalf("create: %v", err) + } + if body.Namespace != "my-client" || body.Location != "DE" || body.Password == "" { + t.Errorf("create body = %+v", body) + } + cfg, _ := config.Load() + if cfg.ActiveClientID != "5" { + t.Errorf("active client = %q, want 5", cfg.ActiveClientID) + } + if !strings.Contains(out.String(), "u-123") { + t.Errorf("output missing username:\n%s", out.String()) + } +} + +func TestClientCreate_AskAnAdmin(t *testing.T) { + withClientBackend(t, func(w http.ResponseWriter, r *http.Request) { + switch { + case r.URL.Path == "/edge-device/" && r.Method == http.MethodGet: + _, _ = w.Write([]byte(`[]`)) + case r.URL.Path == "/edge-device/" && r.Method == http.MethodPost: + w.WriteHeader(http.StatusForbidden) + _, _ = w.Write([]byte(`{"detail":"no permission"}`)) + case r.URL.Path == "/edge-device/admins/": + _, _ = w.Write([]byte(`[{"name":"Ada","email":"ada@co.io"}]`)) + } + }) + var out bytes.Buffer + err := runClientCreate(context.Background(), ui.New(&out), nil, clientCreateOpts{name: "my-client", location: "DE", yes: true}) + if err == nil || !strings.Contains(err.Error(), "CLIENT_WRITE") { + t.Errorf("want permission error, got %v", err) + } + if !strings.Contains(out.String(), "ada@co.io") { + t.Errorf("expected admins shown, got:\n%s", out.String()) + } +} + +func TestClientCreate_RequiresLogin(t *testing.T) { + t.Setenv("TRACEBLOC_CONFIG_DIR", t.TempDir()) // no config → not signed in + err := runClientCreate(context.Background(), ui.New(&bytes.Buffer{}), nil, clientCreateOpts{name: "x", location: "DE", yes: true}) + if err == nil || !strings.Contains(err.Error(), "login") { + t.Errorf("want not-signed-in error, got %v", err) + } +} + +func TestClientList(t *testing.T) { + withClientBackend(t, func(w http.ResponseWriter, _ *http.Request) { + _, _ = w.Write([]byte(`[{"id":1,"first_name":"alpha","namespace":"alpha","location":"DE"},{"id":2,"first_name":"beta","namespace":"beta","location":"US"}]`)) + }) + var out bytes.Buffer + if err := runClientList(context.Background(), ui.New(&out)); err != nil { + t.Fatal(err) + } + for _, want := range []string{"alpha", "beta"} { + if !strings.Contains(out.String(), want) { + t.Errorf("list missing %q:\n%s", want, out.String()) + } + } +} + +func TestClientUse(t *testing.T) { + withClientBackend(t, func(w http.ResponseWriter, _ *http.Request) { + _, _ = w.Write([]byte(`[{"id":7,"first_name":"gamma","namespace":"gamma"}]`)) + }) + if err := runClientUse(context.Background(), ui.New(&bytes.Buffer{}), "7"); err != nil { + t.Fatal(err) + } + cfg, _ := config.Load() + if cfg.ActiveClientID != "7" { + t.Errorf("active = %q, want 7", cfg.ActiveClientID) + } + if err := runClientUse(context.Background(), ui.New(&bytes.Buffer{}), "99"); err == nil { + t.Error("expected an error for an unknown client id") + } +} + +func TestClientCreate_Interactive(t *testing.T) { + var body api.CreateClientRequest + posted := false + withClientBackend(t, func(w http.ResponseWriter, r *http.Request) { + switch { + case r.Method == http.MethodGet && r.URL.Path == "/edge-device/": + _, _ = w.Write([]byte(`[]`)) + case r.Method == http.MethodPost && r.URL.Path == "/edge-device/": + posted = true + _ = json.NewDecoder(r.Body).Decode(&body) + w.WriteHeader(http.StatusCreated) + _, _ = w.Write([]byte(`{"id":9,"first_name":"Lab One","username":"u-9","namespace":"lab-one","location":"DE"}`)) + } + }) + confirmYes := true + pr := &fakePrompter{ + answers: map[string]string{ + "Client name": "Lab One", + "Location zone (e.g. DE)": "DE", + }, + confirm: &confirmYes, + } + var out bytes.Buffer + if err := runClientCreate(context.Background(), ui.New(&out), pr, clientCreateOpts{}); err != nil { + t.Fatalf("interactive create: %v", err) + } + if !posted { + t.Fatal("expected a POST after the user confirmed") + } + if body.Name != "Lab One" || body.Namespace != "lab-one" || body.Location != "DE" { + t.Errorf("create body = %+v", body) + } + if !strings.Contains(out.String(), "Review") { + t.Errorf("expected a review section before the confirm, got:\n%s", out.String()) + } +} + +func TestClientCreate_InteractiveCancel(t *testing.T) { + posted := false + withClientBackend(t, func(w http.ResponseWriter, r *http.Request) { + if r.Method == http.MethodPost { + posted = true + } + _, _ = w.Write([]byte(`[]`)) + }) + confirmNo := false + pr := &fakePrompter{ + answers: map[string]string{ + "Client name": "Lab Two", + "Location zone (e.g. DE)": "US", + }, + confirm: &confirmNo, + } + var out bytes.Buffer + if err := runClientCreate(context.Background(), ui.New(&out), pr, clientCreateOpts{}); err != nil { + t.Fatalf("declining the confirm should be a clean exit, got: %v", err) + } + if posted { + t.Error("no client should be created when the user declines the confirm") + } + if !strings.Contains(out.String(), "Cancelled") { + t.Errorf("expected a Cancelled note, got:\n%s", out.String()) + } +} + +func TestClientList_Paginated(t *testing.T) { + withClientBackend(t, func(w http.ResponseWriter, r *http.Request) { + if r.URL.Query().Get("page") == "2" { + _, _ = w.Write([]byte(`{"count":2,"next":null,"results":[{"id":2,"first_name":"beta","namespace":"beta"}]}`)) + return + } + // page 1: an absolute `next` link, like real DRF pagination + _, _ = fmt.Fprintf(w, `{"count":2,"next":"http://%s/edge-device/?page=2","results":[{"id":1,"first_name":"alpha","namespace":"alpha"}]}`, r.Host) + }) + var out bytes.Buffer + if err := runClientList(context.Background(), ui.New(&out)); err != nil { + t.Fatal(err) + } + for _, want := range []string{"alpha", "beta"} { + if !strings.Contains(out.String(), want) { + t.Errorf("paginated list missing %q (next not followed?):\n%s", want, out.String()) + } + } +} + +func TestClientCreate_CollisionSuffix(t *testing.T) { + var body api.CreateClientRequest + withClientBackend(t, func(w http.ResponseWriter, r *http.Request) { + switch { + case r.Method == http.MethodGet: + // an existing client already holds "my-client" + _, _ = w.Write([]byte(`[{"id":1,"first_name":"My Client","namespace":"my-client"}]`)) + case r.Method == http.MethodPost: + _ = json.NewDecoder(r.Body).Decode(&body) + w.WriteHeader(http.StatusCreated) + _, _ = w.Write([]byte(`{"id":2,"first_name":"My Client","username":"u-2","namespace":"my-client-2","location":"DE"}`)) + } + }) + if err := runClientCreate(context.Background(), ui.New(&bytes.Buffer{}), nil, clientCreateOpts{name: "My Client", location: "DE", yes: true}); err != nil { + t.Fatal(err) + } + if body.Namespace != "my-client-2" { + t.Errorf("namespace = %q, want my-client-2 (collision suffix not applied)", body.Namespace) + } +} + +func TestClientCreate_AnchorMint(t *testing.T) { + var body api.CreateClientRequest + withClientBackend(t, func(w http.ResponseWriter, r *http.Request) { + switch { + case r.Method == http.MethodGet: + _, _ = w.Write([]byte(`[]`)) + case r.Method == http.MethodPost: + _ = json.NewDecoder(r.Body).Decode(&body) + w.WriteHeader(http.StatusCreated) // 201 = minted + _, _ = w.Write([]byte(`{"id":5,"first_name":"c","username":"u-5","namespace":"c","location":"DE","cluster_id":"uid-1"}`)) + } + }) + stubClusterID(t, "uid-1", nil) + var out bytes.Buffer + if err := runClientCreate(context.Background(), ui.New(&out), nil, clientCreateOpts{name: "c", location: "DE", yes: true}); err != nil { + t.Fatalf("create: %v", err) + } + if body.ClusterID != "uid-1" { + t.Errorf("cluster_id sent = %q, want uid-1 (anchor not wired into the request)", body.ClusterID) + } + if !strings.Contains(out.String(), "Machine credential") { + t.Errorf("mint should print the credential, got:\n%s", out.String()) + } +} + +func TestClientCreate_AdoptIdempotent(t *testing.T) { + posts := 0 + var lastBody api.CreateClientRequest + withClientBackend(t, func(w http.ResponseWriter, r *http.Request) { + switch { + case r.Method == http.MethodGet: + _, _ = w.Write([]byte(`[]`)) + case r.Method == http.MethodPost: + posts++ + _ = json.NewDecoder(r.Body).Decode(&lastBody) + w.WriteHeader(http.StatusOK) // 200 = adopted an existing client + _, _ = w.Write([]byte(`{"id":8,"first_name":"existing","username":"u-8","namespace":"existing","location":"DE","cluster_id":"uid-1"}`)) + } + }) + stubClusterID(t, "uid-1", nil) + + run := func() string { + var out bytes.Buffer + if err := runClientCreate(context.Background(), ui.New(&out), nil, clientCreateOpts{name: "c", location: "DE", yes: true}); err != nil { + t.Fatalf("adopt: %v", err) + } + return out.String() + } + // Two runs against the same cluster — real re-run idempotency: each POSTs once, + // both adopt the SAME existing client, neither prints a credential. + first, second := run(), run() + if posts != 2 { + t.Fatalf("posts = %d, want 2 (each run POSTs once)", posts) + } + for i, out := range []string{first, second} { + if !strings.Contains(out, "adopted") { + t.Errorf("run %d: adopt should say so, got:\n%s", i, out) + } + if strings.Contains(out, "Machine credential") { + t.Errorf("run %d: adopt must NOT print a credential, got:\n%s", i, out) + } + } + // The credential is still SENT on every create (the backend uses it only on a + // mint, §7.2) even though it's never printed on adopt. + if lastBody.Password == "" { + t.Error("password should still be sent in the adopt POST body") + } + cfg, _ := config.Load() + if cfg.ActiveClientID != "8" { + t.Errorf("active client = %q, want 8 (adopted id)", cfg.ActiveClientID) + } +} + +func TestClientCreate_CredentialFileMint(t *testing.T) { + withClientBackend(t, func(w http.ResponseWriter, r *http.Request) { + switch { + case r.Method == http.MethodGet: + _, _ = w.Write([]byte(`[]`)) + case r.Method == http.MethodPost: + w.WriteHeader(http.StatusCreated) + _, _ = w.Write([]byte(`{"id":5,"first_name":"c","username":"u-5","namespace":"my-ns","location":"DE","cluster_id":"uid-1"}`)) + } + }) + stubClusterID(t, "uid-1", nil) + credPath := filepath.Join(t.TempDir(), "cred.env") + var out bytes.Buffer + if err := runClientCreate(context.Background(), ui.New(&out), nil, + clientCreateOpts{name: "c", location: "DE", yes: true, credentialFile: credPath}); err != nil { + t.Fatalf("create: %v", err) + } + // never-show: the secret must NOT hit the terminal. + if strings.Contains(out.String(), "Machine credential") || strings.Contains(out.String(), "password") { + t.Errorf("credential must not be printed when --credential-file is set, got:\n%s", out.String()) + } + // the file is 0600 and carries the sourceable credential. + info, err := os.Stat(credPath) + if err != nil { + t.Fatalf("credential file not written: %v", err) + } + if perm := info.Mode().Perm(); perm != 0o600 { + t.Errorf("credential file mode = %o, want 600", perm) + } + kv := parseEnvFile(t, credPath) + if kv["TRACEBLOC_CLIENT_ID"] != "5" || kv["TB_NAMESPACE"] != "my-ns" || kv["TRACEBLOC_CLIENT_PASSWORD"] == "" { + t.Errorf("credential file = %v (want id=5, ns=my-ns, non-empty password)", kv) + } + // never-show, the real invariant: the minted password VALUE must not appear + // in stdout under any label (the string checks above are just a proxy). + if strings.Contains(out.String(), kv["TRACEBLOC_CLIENT_PASSWORD"]) { + t.Errorf("minted password leaked to the terminal:\n%s", out.String()) + } +} + +// TestClientCreate_CredentialFilePreexistingPerms locks in the 0600 guarantee +// when the target already exists with looser perms — os.WriteFile would have +// kept the stale mode and leaked the secret group/other-readable. +func TestClientCreate_CredentialFilePreexistingPerms(t *testing.T) { + withClientBackend(t, func(w http.ResponseWriter, r *http.Request) { + switch { + case r.Method == http.MethodGet: + _, _ = w.Write([]byte(`[]`)) + case r.Method == http.MethodPost: + w.WriteHeader(http.StatusCreated) + _, _ = w.Write([]byte(`{"id":5,"first_name":"c","username":"u-5","namespace":"my-ns","location":"DE","cluster_id":"uid-1"}`)) + } + }) + stubClusterID(t, "uid-1", nil) + credPath := filepath.Join(t.TempDir(), "cred.env") + // A stale, world-readable file already sits at the target path. + if err := os.WriteFile(credPath, []byte("stale\n"), 0o644); err != nil { + t.Fatal(err) + } + var out bytes.Buffer + if err := runClientCreate(context.Background(), ui.New(&out), nil, + clientCreateOpts{name: "c", location: "DE", yes: true, credentialFile: credPath}); err != nil { + t.Fatalf("create: %v", err) + } + info, err := os.Stat(credPath) + if err != nil { + t.Fatalf("credential file not written: %v", err) + } + if perm := info.Mode().Perm(); perm != 0o600 { + t.Errorf("credential file mode = %o over a pre-existing 0644 target, want 600", perm) + } +} + +// TestClientCreate_CredentialFileWriteFailFatal asserts a credential-file write +// failure is fatal — the minted password is the only copy, so a failed write must +// surface an error, never a silent drop. The target's parent is a regular file, so +// the directory create (hence the write) fails deterministically. +func TestClientCreate_CredentialFileWriteFailFatal(t *testing.T) { + withClientBackend(t, func(w http.ResponseWriter, r *http.Request) { + switch { + case r.Method == http.MethodGet: + _, _ = w.Write([]byte(`[]`)) + case r.Method == http.MethodPost: + w.WriteHeader(http.StatusCreated) + _, _ = w.Write([]byte(`{"id":5,"first_name":"c","username":"u-5","namespace":"my-ns","location":"DE","cluster_id":"uid-1"}`)) + } + }) + stubClusterID(t, "uid-1", nil) + notADir := filepath.Join(t.TempDir(), "iam-a-file") + if err := os.WriteFile(notADir, []byte("x"), 0o600); err != nil { + t.Fatal(err) + } + credPath := filepath.Join(notADir, "cred.env") // parent is a file → write fails + var out bytes.Buffer + err := runClientCreate(context.Background(), ui.New(&out), nil, + clientCreateOpts{name: "c", location: "DE", yes: true, credentialFile: credPath}) + if err == nil { + t.Fatal("expected a fatal error when the credential file can't be written, got nil") + } +} + +func TestClientCreate_CredentialFileAdopt(t *testing.T) { + withClientBackend(t, func(w http.ResponseWriter, r *http.Request) { + switch { + case r.Method == http.MethodGet: + _, _ = w.Write([]byte(`[]`)) + case r.Method == http.MethodPost: + w.WriteHeader(http.StatusOK) // adopt + _, _ = w.Write([]byte(`{"id":8,"first_name":"existing","username":"u-8","namespace":"ex-ns","location":"DE","cluster_id":"uid-1"}`)) + } + }) + stubClusterID(t, "uid-1", nil) + credPath := filepath.Join(t.TempDir(), "cred.env") + var out bytes.Buffer + if err := runClientCreate(context.Background(), ui.New(&out), nil, + clientCreateOpts{name: "c", location: "DE", yes: true, credentialFile: credPath}); err != nil { + t.Fatalf("adopt: %v", err) + } + kv := parseEnvFile(t, credPath) + // adopt emits id + namespace + the ADOPTED marker, but NO password (the + // existing one stands; it's write-only on the backend). + if kv["TRACEBLOC_CLIENT_ID"] != "8" || kv["TB_NAMESPACE"] != "ex-ns" || kv["TRACEBLOC_CLIENT_ADOPTED"] != "1" { + t.Errorf("adopt credential file = %v (want id=8, ns=ex-ns, ADOPTED=1)", kv) + } + if _, hasPw := kv["TRACEBLOC_CLIENT_PASSWORD"]; hasPw { + t.Errorf("adopt must not write a password (none issued), got:\n%v", kv) + } +} + +// parseEnvFile reads a KEY=value env file (skipping # comments) into a map. +func parseEnvFile(t *testing.T, path string) map[string]string { + t.Helper() + raw, err := os.ReadFile(path) + if err != nil { + t.Fatalf("read credential file: %v", err) + } + kv := map[string]string{} + for _, line := range strings.Split(string(raw), "\n") { + line = strings.TrimSpace(line) + if line == "" || strings.HasPrefix(line, "#") { + continue + } + if k, v, ok := strings.Cut(line, "="); ok { + kv[k] = v + } + } + return kv +} + +func TestClientCreate_ClusterConflict(t *testing.T) { + withClientBackend(t, func(w http.ResponseWriter, r *http.Request) { + switch { + case r.Method == http.MethodGet: + _, _ = w.Write([]byte(`[]`)) + case r.Method == http.MethodPost: + w.WriteHeader(http.StatusConflict) // 409 = bound to another account (R6) + _, _ = w.Write([]byte(`{"error":"cluster_conflict","cluster_id":"uid-1"}`)) + } + }) + stubClusterID(t, "uid-1", nil) + err := runClientCreate(context.Background(), ui.New(&bytes.Buffer{}), nil, clientCreateOpts{name: "c", location: "DE", yes: true}) + if err == nil || !strings.Contains(err.Error(), "different tracebloc account") { + t.Errorf("want a cluster_conflict error, got %v", err) + } +} + +func TestClientCreate_NoClusterAnchorWarns(t *testing.T) { + var body api.CreateClientRequest + withClientBackend(t, func(w http.ResponseWriter, r *http.Request) { + switch { + case r.Method == http.MethodGet: + _, _ = w.Write([]byte(`[]`)) + case r.Method == http.MethodPost: + _ = json.NewDecoder(r.Body).Decode(&body) + w.WriteHeader(http.StatusCreated) + _, _ = w.Write([]byte(`{"id":3,"first_name":"c","username":"u-3","namespace":"c","location":"DE"}`)) + } + }) + // readClusterID left at the withClientBackend default (returns an error). + var out bytes.Buffer + if err := runClientCreate(context.Background(), ui.New(&out), nil, clientCreateOpts{name: "c", location: "DE", yes: true}); err != nil { + t.Fatalf("create: %v", err) + } + if body.ClusterID != "" { + t.Errorf("cluster_id sent = %q, want empty (no anchor when cluster unreadable)", body.ClusterID) + } + if !strings.Contains(out.String(), "without a cluster anchor") { + t.Errorf("expected a never-silent hint about the missing anchor, got:\n%s", out.String()) + } + // The no-anchor path must still complete a full mint — the credential is shown. + if !strings.Contains(out.String(), "Machine credential") { + t.Errorf("no-anchor fallback should still print the credential, got:\n%s", out.String()) + } +} + +// TestClientCreate_ReRunReviewShowsAdoptedNamespace pins that on an idempotent +// re-run, the client already anchored to this cluster is excluded from collision +// detection — so the review shows the namespace that's actually adopted +// (lab-one), not a bumped lab-one-2. +func TestClientCreate_ReRunReviewShowsAdoptedNamespace(t *testing.T) { + withClientBackend(t, func(w http.ResponseWriter, r *http.Request) { + switch { + case r.Method == http.MethodGet && r.URL.Path == "/edge-device/": + // An existing client already anchored to THIS cluster (uid-1). + _, _ = w.Write([]byte(`[{"id":1,"first_name":"Lab One","username":"u-1","namespace":"lab-one","location":"DE","cluster_id":"uid-1"}]`)) + case r.Method == http.MethodPost && r.URL.Path == "/edge-device/": + w.WriteHeader(http.StatusOK) // adopt + _, _ = w.Write([]byte(`{"id":1,"first_name":"Lab One","username":"u-1","namespace":"lab-one","location":"DE","cluster_id":"uid-1"}`)) + } + }) + stubClusterID(t, "uid-1", nil) + confirmYes := true + pr := &fakePrompter{answers: map[string]string{}, confirm: &confirmYes} + var out bytes.Buffer + if err := runClientCreate(context.Background(), ui.New(&out), pr, + clientCreateOpts{name: "Lab One", location: "DE"}); err != nil { + t.Fatalf("re-run create: %v", err) + } + if strings.Contains(out.String(), "lab-one-2") { + t.Errorf("review showed a bumped namespace — the cluster's own client wasn't excluded from collision detection:\n%s", out.String()) + } +} diff --git a/internal/cli/cluster.go b/internal/cli/cluster.go index cea1b27..db2cd7a 100644 --- a/internal/cli/cluster.go +++ b/internal/cli/cluster.go @@ -13,11 +13,13 @@ import ( "github.com/tracebloc/cli/internal/ui" ) -// newClusterCmd wires the `tracebloc cluster` subtree. Today it has -// a single verb — `info` — which is the customer's "is the CLI -// pointing at the right cluster?" pre-flight before running -// `dataset push`. Future verbs (e.g. `cluster doctor` for -// diagnostics, `cluster contexts` for switching) hang off this +// newClusterCmd wires the `tracebloc cluster` subtree: +// - `info` — the customer's "is the CLI pointing at the right +// cluster?" pre-flight before running `dataset push`. +// - `doctor` — a read-only health sweep of the running release with +// ✔/⚠/✖ checks + remedies (epic client-runtime#116, WS3). +// +// Future verbs (e.g. `cluster contexts` for switching) hang off this // parent in later phases. func newClusterCmd() *cobra.Command { cmd := &cobra.Command{ @@ -33,6 +35,7 @@ the wrong cluster).`, } cmd.AddCommand(newClusterInfoCmd()) + cmd.AddCommand(newClusterDoctorCmd()) return cmd } diff --git a/internal/cli/dataset.go b/internal/cli/dataset.go index 1b63d92..012fd2c 100644 --- a/internal/cli/dataset.go +++ b/internal/cli/dataset.go @@ -222,8 +222,7 @@ Exit codes: cmd.Flags().StringVar(&table, "table", "", "destination table name (MySQL identifier; matches /data/shared// on the PVC)") cmd.Flags().StringVar(&category, "category", "image_classification", - "task category: image_classification, tabular_classification, tabular_regression, "+ - "time_series_forecasting, time_to_event_prediction") + "task category, one of: "+push.SupportedCategoriesList()) cmd.Flags().StringVar(&intent, "intent", "", "intent: train|test") cmd.Flags().StringVar(&labelColumn, "label-column", "", @@ -422,24 +421,23 @@ contributors train against it without ever seeing the raw files.`)) case a.Spec.Category == "": // Left empty by a caller; let the schema produce the canonical // "category is required" error downstream. - case push.IsTabular(a.Spec.Category) || push.IsText(a.Spec.Category) || - a.Spec.Category == "image_classification" || - a.Spec.Category == "object_detection" || - a.Spec.Category == "keypoint_detection": + case push.IsCLISupported(a.Spec.Category): // supported - case push.IsImage(a.Spec.Category): - // semantic_segmentation / instance_segmentation + case push.IsKnown(a.Spec.Category): + // A recognized category dataset push doesn't implement yet — image + // (semantic_segmentation / instance_segmentation) or text + // (causal_language_modeling). Routed here (not the default branch) so the + // user gets the registry's per-category pending-support reason, not a + // misleading "unrecognized category". Supported categories were already + // caught above, so IsKnown here means known-but-unsupported. + spec, _ := push.Lookup(a.Spec.Category) return &exitError{code: 2, err: fmt.Errorf( - "category %q isn't supported by the CLI yet. semantic_segmentation is "+ - "blocked on the ingestor's mask-sidecar support (data-ingestors#136), and "+ - "instance_segmentation isn't implemented. Supported image categories: "+ - "image_classification, object_detection, keypoint_detection.", a.Spec.Category)} + "category %q isn't supported by the CLI yet (%s). Supported categories: %s.", + a.Spec.Category, spec.UnsupportedNote, push.SupportedCategoriesList())} default: return &exitError{code: 2, err: fmt.Errorf( - "category %q isn't a recognized task category. Supported: image_classification, "+ - "object_detection, keypoint_detection, text_classification, "+ - "masked_language_modeling, tabular_classification, tabular_regression, "+ - "time_series_forecasting, time_to_event_prediction.", a.Spec.Category)} + "category %q isn't a recognized task category. Supported categories: %s.", + a.Spec.Category, push.SupportedCategoriesList())} } // 3. Walk the local directory FIRST (local "fail fast"), dispatched diff --git a/internal/cli/dataset_rm.go b/internal/cli/dataset_rm.go index a7c398f..ecc83f6 100644 --- a/internal/cli/dataset_rm.go +++ b/internal/cli/dataset_rm.go @@ -178,18 +178,30 @@ undone — re-pushing the data is the only way back.`) } } - // 7. Execute the in-cluster teardown. + // 7. Execute the in-cluster teardown. File removal runs in a + // short-lived pod that shares the stage pod's identity (uid + // 65532 + fsGroup 65532), so it owns and can delete the staging + // files on any volume type — including hostPath, where fsGroup is + // a no-op (tracebloc/client#259). p.Infof("Removing in-cluster artifacts…") - res, err := push.Teardown(ctx, cs, resolved.RestConfig, resolved.Namespace, plan) + res, err := push.Teardown(ctx, cs, &push.SPDYExecutor{Config: resolved.RestConfig, Client: cs}, resolved.Namespace, plan, push.PodSpecOptions{ + Namespace: resolved.Namespace, + PVCClaimName: pvc.ClaimName, + PVCMountPath: pvc.MountPath, + Table: a.Table, + ServiceAccountName: release.IngestorSAName, + // Image left empty → push.DefaultStagePodImage (alpine; has rm). + }) if err != nil { - // Teardown is two sequential destructive ops; if the table drop - // succeeded but file removal didn't, say so — both ops are - // idempotent, so re-running completes the cleanup. (Bugbot #49) + // Two sequential destructive ops. If the table dropped but file + // removal didn't, the drop is idempotent (DROP TABLE IF EXISTS), + // so re-running is safe; if it keeps failing, remove the leftover + // staging dirs on the node directly. if res.DroppedTable { return &exitError{code: 7, err: fmt.Errorf( - "teardown incomplete — the table %s.%s was dropped, but removing its files failed; "+ - "re-run `tracebloc dataset rm %s` to remove the leftover files: %w", - plan.Database, plan.Table, a.Table, err)} + "teardown incomplete — the table %s.%s was dropped, but removing its files failed: %w; "+ + "re-run `tracebloc dataset rm %s`, or delete the leftover staging dirs on the node", + plan.Database, plan.Table, err, a.Table)} } return &exitError{code: 7, err: fmt.Errorf("teardown failed: %w", err)} } diff --git a/internal/cli/dataset_test.go b/internal/cli/dataset_test.go index 5d5cfcf..b8415b5 100644 --- a/internal/cli/dataset_test.go +++ b/internal/cli/dataset_test.go @@ -93,6 +93,37 @@ func TestDatasetPush_UnsupportedCategory_ExitsTwo(t *testing.T) { } } +// TestDatasetPush_KnownUnsupportedCategory_PendingNote pins the Bugbot fix +// (v0.4.0 RC): a registry-known but CLI-unsupported NON-image category +// (causal_language_modeling) must get the registry's pending-support note, not +// the misleading "isn't a recognized task category" message. execDatasetPush +// discards the error and SilenceErrors swallows it, so run the command here and +// inspect the returned error directly. +func TestDatasetPush_KnownUnsupportedCategory_PendingNote(t *testing.T) { + root := imgcLayout(t) + rootCmd := NewRootCmd(BuildInfo{Version: "test"}) + rootCmd.SetOut(&bytes.Buffer{}) + rootCmd.SetErr(&bytes.Buffer{}) + rootCmd.SetArgs([]string{"dataset", "push", + "--kubeconfig=/tmp/tracebloc-cli-test-nonexistent-" + t.Name(), + root, "--table=t1", "--category=causal_language_modeling", + "--intent=train", "--label-column=label"}) + err := rootCmd.Execute() + if err == nil { + t.Fatal("expected an error for a known-but-unsupported category") + } + if got := ExitCodeFromError(err); got != 2 { + t.Fatalf("exit code = %d, want 2", got) + } + msg := err.Error() + if strings.Contains(msg, "isn't a recognized task category") { + t.Errorf("known category misrouted to the unrecognized-category branch:\n%s", msg) + } + if !strings.Contains(msg, "isn't supported by the CLI yet") { + t.Errorf("want the registry pending-support note, got:\n%s", msg) + } +} + // TestDatasetPush_TraversalTableName_ExitsTwo is the security // regression pin at the CLI layer. --table=../../etc must be // rejected with exit 2 BEFORE any spec synthesis or cluster work — diff --git a/internal/cli/doctor.go b/internal/cli/doctor.go new file mode 100644 index 0000000..71a484f --- /dev/null +++ b/internal/cli/doctor.go @@ -0,0 +1,133 @@ +package cli + +import ( + "context" + "fmt" + + "github.com/spf13/cobra" + + "github.com/tracebloc/cli/internal/cluster" + "github.com/tracebloc/cli/internal/doctor" + "github.com/tracebloc/cli/internal/ui" +) + +// newClusterDoctorCmd implements `tracebloc cluster doctor` — the sibling of +// `cluster info` that cluster.go's doc comment anticipated. Where `info` +// answers "is the CLI pointing at the right cluster?", `doctor` answers "is +// this running cluster healthy enough to run an experiment, and if not, what +// do I fix?" — a read-only, post-install health sweep with remedies +// (epic client-runtime#116, WS3). +// +// The three kubeconfig flags match `cluster info` exactly so muscle memory +// carries over; all are zero-value-safe. +func newClusterDoctorCmd() *cobra.Command { + var ( + kubeconfigPath string + contextOverride string + nsOverride string + ) + + cmd := &cobra.Command{ + Use: "doctor", + Short: "Diagnose a running tracebloc client cluster (✔/⚠/✖ health checks + remedies)", + Long: `Runs a read-only health sweep over the tracebloc client release in the +configured cluster + namespace and prints a ✔/⚠/✖ line per check with a +remedy for anything that isn't green: + + • Cluster reachable — the API answers and the client chart is installed + • Pod health — nothing crash-looping or stuck Pending + • Dataset volume — the shared PVC exists and is Bound + • Proxy configuration — the in-cluster requests/egress proxy wiring + • Backend egress — the tracebloc backend is reachable (from this machine) + • Service Bus egress — the requests-proxy that brokers experiment egress is Ready + +For a full redacted support bundle to send to tracebloc, use the installer's +` + "`./install-k8s.sh --diagnose`" + ` instead. + +Exit codes: + 0 all checks passed (or warnings only) + 2 one or more checks failed + 3 kubeconfig could not be loaded`, + Args: cobra.NoArgs, + RunE: func(cmd *cobra.Command, _ []string) error { + return runClusterDoctor( + cmd.Context(), + printerFor(cmd), + kubeconfigPath, contextOverride, nsOverride, + ) + }, + } + + cmd.Flags().StringVar(&kubeconfigPath, "kubeconfig", "", + "path to the kubeconfig file (default: $KUBECONFIG, then ~/.kube/config)") + cmd.Flags().StringVar(&contextOverride, "context", "", + "name of the kubeconfig context to use (default: kubeconfig's current-context)") + cmd.Flags().StringVarP(&nsOverride, "namespace", "n", "", + "namespace where the parent tracebloc/client release is installed (default: the context's namespace, or 'default')") + + return cmd +} + +func runClusterDoctor( + ctx context.Context, + p *ui.Printer, + kubeconfigPath, contextOverride, nsOverride string, +) error { + p.Banner("tracebloc", "cluster doctor") + + resolved, err := cluster.Load(cluster.KubeconfigOptions{ + Path: kubeconfigPath, + Context: contextOverride, + Namespace: nsOverride, + }) + if err != nil { + // 3 = kubeconfig file/parse problem (same class as cluster info). + return &exitError{code: 3, err: fmt.Errorf("loading kubeconfig: %w", err)} + } + + cs, err := cluster.NewClientset(resolved) + if err != nil { + return &exitError{code: 3, err: err} + } + + p.Section("Kubeconfig") + p.Field("context", resolved.Context) + p.Field("server", resolved.ServerURL) + p.Field("namespace", resolved.Namespace) + + results := doctor.Run(ctx, cs, doctor.Options{Namespace: resolved.Namespace}) + + p.Section("Checks") + for _, r := range results { + switch r.Status { + case doctor.StatusOK: + p.Successf("%s — %s", r.Name, r.Detail) + case doctor.StatusWarn: + p.Warnf("%s — %s", r.Name, r.Detail) + if r.Remedy != "" { + p.Hintf(" %s", r.Remedy) + } + case doctor.StatusFail: + p.Errorf("%s — %s", r.Name, r.Detail) + if r.Remedy != "" { + p.Hintf(" %s", r.Remedy) + } + } + } + + p.Newline() + switch doctor.Worst(results) { + case doctor.StatusFail: + p.Errorf("Problems found — fix the ✖ items above.") + p.Hintf("For deeper triage, send tracebloc a support bundle: ./install-k8s.sh --diagnose") + // Silent (err == nil): the per-check lines above already explained it, + // so main() shouldn't print a redundant "Error:" line. + return &exitError{code: 2, err: nil} + case doctor.StatusWarn: + p.Warnf("Completed with warnings — review the ⚠ items above.") + return nil + default: + p.Successf("All checks passed — the cluster looks healthy.") + return nil + } +} diff --git a/internal/cli/interactive.go b/internal/cli/interactive.go index 3c567bf..bb6478d 100644 --- a/internal/cli/interactive.go +++ b/internal/cli/interactive.go @@ -16,20 +16,12 @@ import ( ) // promptCategories is the ordered list offered by the interactive -// category picker — the categories `dataset push` supports today (the -// same set runDatasetPush's category gate accepts). semantic_ / -// instance_segmentation are omitted until they're implemented. -var promptCategories = []string{ - "image_classification", - "object_detection", - "keypoint_detection", - "text_classification", - "masked_language_modeling", - "tabular_classification", - "tabular_regression", - "time_series_forecasting", - "time_to_event_prediction", -} +// category picker. It derives from the push registry's CLI-supported +// set — the exact categories runDatasetPush's gate accepts — so the +// picker can't drift from what `dataset push` actually supports. +// semantic_/instance_segmentation are excluded (CLISupported=false) +// until they're implemented. +var promptCategories = push.SupportedCategoryIDs() // prompter is the narrow seam over the interactive library. Production // uses surveyPrompter (a real terminal); tests inject a fake that diff --git a/internal/cli/root.go b/internal/cli/root.go index f1047d6..0bf6f8c 100644 --- a/internal/cli/root.go +++ b/internal/cli/root.go @@ -81,6 +81,11 @@ what's planned next.`, root.AddCommand(newIngestCmd()) root.AddCommand(newClusterCmd()) root.AddCommand(newDatasetCmd()) + // RFC-0001 (backend#830): browser sign-in + client provisioning. + root.AddCommand(newLoginCmd()) + root.AddCommand(newLogoutCmd()) + root.AddCommand(newAuthCmd()) + root.AddCommand(newClientCmd()) // Bare `tracebloc` (no subcommand) renders a friendly home screen // instead of cobra's raw usage dump. Subcommands and --help are diff --git a/internal/cluster/identity.go b/internal/cluster/identity.go new file mode 100644 index 0000000..f33f14c --- /dev/null +++ b/internal/cluster/identity.go @@ -0,0 +1,50 @@ +package cluster + +import ( + "context" + "fmt" + "time" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" +) + +// clusterIDReadTimeout bounds the best-effort anchor read in ClusterID: a +// kubeconfig pointing at an unreachable API server would otherwise hang the +// kube-system GET for the OS TCP timeout, stalling a `client create` that is +// meant to degrade to a non-anchored mint instead. +const clusterIDReadTimeout = 8 * time.Second + +// ClusterID reads the kube-system namespace UID — the stable per-cluster +// fingerprint RFC-0001 keys client idempotency on (§6.3 / §7.2; backend#883). +// It needs a reachable cluster and RBAC to GET namespaces/kube-system. Callers +// treat a failure as "couldn't read the cluster identity" and fall back to a +// non-anchored (dual-mode) provision — they must not block on it. +func ClusterID(ctx context.Context, opts KubeconfigOptions) (string, error) { + rc, err := Load(opts) + if err != nil { + return "", err + } + // Best-effort read — callers must not block on it (see the doc above), so cap + // it; otherwise an unreachable API server hangs the GET below. + rc.RestConfig.Timeout = clusterIDReadTimeout + cs, err := NewClientset(rc) + if err != nil { + return "", err + } + return clusterIDFrom(ctx, cs) +} + +// clusterIDFrom reads the kube-system UID from a clientset. Split out so it can be +// exercised with a fake clientset without a real cluster. +func clusterIDFrom(ctx context.Context, cs kubernetes.Interface) (string, error) { + ns, err := cs.CoreV1().Namespaces().Get(ctx, "kube-system", metav1.GetOptions{}) + if err != nil { + return "", fmt.Errorf("reading kube-system namespace UID: %w", err) + } + uid := string(ns.UID) + if uid == "" { + return "", fmt.Errorf("kube-system namespace has no UID") + } + return uid, nil +} diff --git a/internal/cluster/identity_test.go b/internal/cluster/identity_test.go new file mode 100644 index 0000000..63c6d46 --- /dev/null +++ b/internal/cluster/identity_test.go @@ -0,0 +1,30 @@ +package cluster + +import ( + "context" + "testing" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes/fake" +) + +func TestClusterIDFrom(t *testing.T) { + cs := fake.NewClientset(&corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{Name: "kube-system", UID: "uid-abc-123"}, + }) + got, err := clusterIDFrom(context.Background(), cs) + if err != nil { + t.Fatal(err) + } + if got != "uid-abc-123" { + t.Errorf("cluster id = %q, want uid-abc-123", got) + } +} + +func TestClusterIDFrom_NoNamespace(t *testing.T) { + cs := fake.NewClientset() // empty cluster — no kube-system + if _, err := clusterIDFrom(context.Background(), cs); err == nil { + t.Error("expected an error when kube-system is absent") + } +} diff --git a/internal/config/config.go b/internal/config/config.go new file mode 100644 index 0000000..5145fad --- /dev/null +++ b/internal/config/config.go @@ -0,0 +1,122 @@ +// Package config persists the tracebloc CLI's user state — the backend +// environment, the user token from `tracebloc login`, and the active client +// for this machine — at ~/.tracebloc/config.json (mode 0600, it holds a +// token). RFC-0001 (backend#830). +package config + +import ( + "encoding/json" + "errors" + "fmt" + "io/fs" + "os" + "path/filepath" +) + +// Config is the on-disk CLI state. Every field is omitempty so a +// partially-configured file stays small and forward-compatible. +type Config struct { + Env string `json:"env,omitempty"` // dev|stg|prod (mirrors CLIENT_ENV) + Email string `json:"email,omitempty"` // who is signed in (display only) + Token string `json:"token,omitempty"` // user token from device login + ActiveClientID string `json:"active_client_id,omitempty"` // client this machine enrolls as +} + +// Dir is the config directory: $TRACEBLOC_CONFIG_DIR if set (tests / ops +// override), else ~/.tracebloc. +func Dir() (string, error) { + if d := os.Getenv("TRACEBLOC_CONFIG_DIR"); d != "" { + return d, nil + } + home, err := os.UserHomeDir() + if err != nil { + return "", fmt.Errorf("locating home directory: %w", err) + } + return filepath.Join(home, ".tracebloc"), nil +} + +// Path is the config file path (Dir()/config.json). +func Path() (string, error) { + dir, err := Dir() + if err != nil { + return "", err + } + return filepath.Join(dir, "config.json"), nil +} + +// Load reads the config. A missing file is NOT an error — it returns an empty +// Config (a fresh machine that has never run `login`). +func Load() (*Config, error) { + path, err := Path() + if err != nil { + return nil, err + } + data, err := os.ReadFile(path) + if errors.Is(err, fs.ErrNotExist) { + return &Config{}, nil + } + if err != nil { + return nil, fmt.Errorf("reading %s: %w", path, err) + } + var c Config + if err := json.Unmarshal(data, &c); err != nil { + return nil, fmt.Errorf("parsing %s: %w", path, err) + } + return &c, nil +} + +// Save writes the config 0600 (creating the dir 0700), atomically: a temp file +// in the same dir then a rename, so a crash mid-write can't truncate the token. +func (c *Config) Save() error { + dir, err := Dir() + if err != nil { + return err + } + if err := os.MkdirAll(dir, 0o700); err != nil { + return fmt.Errorf("creating %s: %w", dir, err) + } + data, err := json.MarshalIndent(c, "", " ") + if err != nil { + return fmt.Errorf("encoding config: %w", err) + } + data = append(data, '\n') + + tmp, err := os.CreateTemp(dir, ".config-*.json") + if err != nil { + return fmt.Errorf("creating temp config: %w", err) + } + tmpName := tmp.Name() + defer func() { _ = os.Remove(tmpName) }() // no-op after a successful rename + if err := tmp.Chmod(0o600); err != nil { + _ = tmp.Close() + return fmt.Errorf("chmod temp config: %w", err) + } + if _, err := tmp.Write(data); err != nil { + _ = tmp.Close() + return fmt.Errorf("writing temp config: %w", err) + } + if err := tmp.Close(); err != nil { + return fmt.Errorf("closing temp config: %w", err) + } + path, _ := Path() + if err := os.Rename(tmpName, path); err != nil { + return fmt.Errorf("replacing %s: %w", path, err) + } + return nil +} + +// Clear removes the config file (full sign-out + reset). A missing file is not +// an error. +func Clear() error { + path, err := Path() + if err != nil { + return err + } + if err := os.Remove(path); err != nil && !errors.Is(err, fs.ErrNotExist) { + return fmt.Errorf("removing %s: %w", path, err) + } + return nil +} + +// SignedIn reports whether a user token is stored. +func (c *Config) SignedIn() bool { return c.Token != "" } diff --git a/internal/config/config_test.go b/internal/config/config_test.go new file mode 100644 index 0000000..53dd368 --- /dev/null +++ b/internal/config/config_test.go @@ -0,0 +1,64 @@ +package config + +import ( + "os" + "testing" +) + +func TestSaveLoadRoundTrip(t *testing.T) { + t.Setenv("TRACEBLOC_CONFIG_DIR", t.TempDir()) + in := &Config{Env: "dev", Email: "a@b.com", Token: "tok123", ActiveClientID: "edge_1"} + if err := in.Save(); err != nil { + t.Fatal(err) + } + out, err := Load() + if err != nil { + t.Fatal(err) + } + if *out != *in { + t.Fatalf("round-trip mismatch: %+v != %+v", out, in) + } + if !out.SignedIn() { + t.Error("SignedIn should be true when a token is present") + } +} + +func TestSaveIs0600(t *testing.T) { + t.Setenv("TRACEBLOC_CONFIG_DIR", t.TempDir()) + if err := (&Config{Token: "secret"}).Save(); err != nil { + t.Fatal(err) + } + p, _ := Path() + fi, err := os.Stat(p) + if err != nil { + t.Fatal(err) + } + if fi.Mode().Perm() != 0o600 { + t.Errorf("config mode = %v, want 0600 (it holds a token)", fi.Mode().Perm()) + } +} + +func TestLoadMissingIsEmpty(t *testing.T) { + t.Setenv("TRACEBLOC_CONFIG_DIR", t.TempDir()) + c, err := Load() + if err != nil { + t.Fatalf("missing config should not error: %v", err) + } + if c.SignedIn() { + t.Errorf("missing config should be empty, got %+v", c) + } +} + +func TestClear(t *testing.T) { + t.Setenv("TRACEBLOC_CONFIG_DIR", t.TempDir()) + _ = (&Config{Token: "x"}).Save() + if err := Clear(); err != nil { + t.Fatal(err) + } + if c, _ := Load(); c.SignedIn() { + t.Error("after Clear, should not be signed in") + } + if err := Clear(); err != nil { + t.Errorf("Clear on a missing file should be nil, got %v", err) + } +} diff --git a/internal/doctor/doctor.go b/internal/doctor/doctor.go new file mode 100644 index 0000000..d69ce9c --- /dev/null +++ b/internal/doctor/doctor.go @@ -0,0 +1,602 @@ +// Package doctor implements the checks behind `tracebloc cluster doctor`: +// a read-only, best-effort health sweep of a running tracebloc client +// cluster. Each check reports ✔/⚠/✖ plus a one-line remedy, so a customer +// can answer "why isn't my experiment running?" without tracebloc shelling +// into their cluster (epic tracebloc/client-runtime#116, WS3). +// +// Design mirrors the installer's preflight.sh: every check is independent +// and returns a Result instead of aborting, so one failure never hides the +// others. Network probes are injectable (Options) so the package is fully +// exercisable with client-go's fake clientset — no real cluster or egress. +package doctor + +import ( + "context" + "encoding/json" + "fmt" + "net/http" + "sort" + "strings" + "time" + + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + + "github.com/tracebloc/cli/internal/cluster" +) + +// Status is a single check's severity. Ordered so the numerically-greatest +// status is the worst — see Worst. +type Status int + +const ( + StatusOK Status = iota + StatusWarn + StatusFail +) + +func (s Status) String() string { + switch s { + case StatusOK: + return "ok" + case StatusWarn: + return "warn" + case StatusFail: + return "fail" + default: + return "unknown" + } +} + +// Result is one check's outcome. Remedy is shown to the customer only when +// Status is not OK. +type Result struct { + Name string + Status Status + Detail string + Remedy string +} + +// Worst returns the most severe status across results (StatusOK for none). +// The doctor command maps a non-OK worst to a non-zero exit code. +func Worst(results []Result) Status { + worst := StatusOK + for _, r := range results { + if r.Status > worst { + worst = r.Status + } + } + return worst +} + +// Conservative tunables, kept as package consts to avoid false positives on a +// busy cluster. If a check ever needs runtime tuning, thread it through Options +// (like HTTPProbe) rather than making these package vars. +const ( + pendingGrace = 5 * time.Minute // a pod Pending longer than this is flagged + httpProbeTimeout = 8 * time.Second +) + +// Options configures a diagnosis run. The zero value is usable: Namespace +// defaults are the caller's concern (it passes the resolved namespace), and +// HTTPProbe falls back to the real proxy-aware prober. +type Options struct { + Namespace string + + // HTTPProbe reports whether a URL is reachable from where the CLI runs. + // nil => httpProbe (proxy-aware, short timeout). Injected in tests. + HTTPProbe func(ctx context.Context, url string) error +} + +// Run executes every check in display order and returns their results. It +// never returns an error: an unreachable cluster or a failing probe is a +// Result, not a Go error — the command renders all of them and derives the +// exit code from Worst. +func Run(ctx context.Context, cs kubernetes.Interface, opts Options) []Result { + if opts.HTTPProbe == nil { + opts.HTTPProbe = httpProbe + } + ns := opts.Namespace + + // Discovered once and shared: the parent release gates nothing (every + // check still runs and reports), but the later checks reuse it. + release, relErr := cluster.DiscoverParentRelease(ctx, cs, ns) + jmEnv := jobsManagerEnv(ctx, cs, ns, release) + + return []Result{ + checkReachable(release, relErr, ns), + checkPods(ctx, cs, ns), + checkPVC(ctx, cs, ns), + checkNodeFit(ctx, cs, jmEnv), + checkImagePull(ctx, cs, ns, release), + checkProxy(jmEnv), + checkBackendEgress(ctx, jmEnv, opts.HTTPProbe), + checkRequestsProxy(ctx, cs, ns, release), + } +} + +// checkReachable confirms the API answered and the parent client chart is +// installed here. It's the gate the customer reads first; the rest still run. +func checkReachable(release *cluster.ParentRelease, err error, ns string) Result { + const name = "Cluster reachable" + if err != nil { + return Result{ + Name: name, + Status: StatusFail, + Detail: err.Error(), + Remedy: "Check your kubeconfig/context and that the tracebloc client chart is installed here: kubectl get deploy -n " + ns, + } + } + return Result{ + Name: name, + Status: StatusOK, + Detail: fmt.Sprintf("release %q, chart %s, appVersion %s (namespace %s)", + release.ReleaseName, release.ChartVersion, release.AppVersion, ns), + } +} + +// checkPods flags crash-looping or long-Pending pods — the local complement +// to the controller's crash-loop detection (client-runtime#117). Conservative +// thresholds keep a transient restart or a briefly-Pending job from tripping it. +func checkPods(ctx context.Context, cs kubernetes.Interface, ns string) Result { + const name = "Pod health" + pods, err := cs.CoreV1().Pods(ns).List(ctx, metav1.ListOptions{}) + if err != nil { + return Result{ + Name: name, + Status: StatusWarn, + Detail: "could not list pods: " + err.Error(), + Remedy: "Ensure your kubeconfig user can list pods in " + ns + ".", + } + } + + var crashing, pending []string + for _, p := range pods.Items { + if podCrashLooping(p) { + crashing = append(crashing, p.Name) + continue + } + if p.Status.Phase == corev1.PodPending && + time.Since(p.CreationTimestamp.Time) > pendingGrace { + pending = append(pending, p.Name) + } + } + sort.Strings(crashing) + sort.Strings(pending) + + switch { + case len(crashing) > 0: + return Result{ + Name: name, + Status: StatusFail, + Detail: fmt.Sprintf("crash-looping: %v", crashing), + Remedy: "Inspect the container's own logs: kubectl logs -n " + ns + " --previous", + } + case len(pending) > 0: + return Result{ + Name: name, + Status: StatusWarn, + Detail: fmt.Sprintf("Pending > %s: %v", pendingGrace, pending), + Remedy: "kubectl describe pod -n " + ns + " — usually unschedulable (resources) or an image pull issue.", + } + default: + return Result{Name: name, Status: StatusOK, Detail: fmt.Sprintf("%d pod(s), none crash-looping or stuck Pending", len(pods.Items))} + } +} + +// podCrashLooping reports whether a pod has a container actively stuck in +// CrashLoopBackOff — the state Kubernetes sets for a container that keeps +// crashing. Both init AND app containers are checked: an init container in +// CrashLoopBackOff keeps the pod Pending and blocks startup silently (Bugbot +// on PR #89). +// +// We deliberately do NOT infer crash-looping from RestartCount: a high count +// is equally produced by a pod that recovered on retry, a job that retried +// before Succeeding, or a completed init container — all healthy (Bugbot on +// PR #89; cf. the controller's recovered-container fix, client-runtime#117). +// The terminal-phase guard is belt-and-suspenders: a Succeeded/Failed pod has +// no waiting containers anyway. +func podCrashLooping(p corev1.Pod) bool { + if p.Status.Phase == corev1.PodSucceeded || p.Status.Phase == corev1.PodFailed { + return false + } + for _, group := range [][]corev1.ContainerStatus{p.Status.InitContainerStatuses, p.Status.ContainerStatuses} { + for _, c := range group { + if c.State.Waiting != nil && c.State.Waiting.Reason == "CrashLoopBackOff" { + return true + } + } + } + return false +} + +// checkPVC reuses cluster.DiscoverSharedPVC — which already verifies the +// shared-data PVC exists and is Bound, with actionable errors. +func checkPVC(ctx context.Context, cs kubernetes.Interface, ns string) Result { + const name = "Dataset volume (PVC)" + pvc, err := cluster.DiscoverSharedPVC(ctx, cs, ns) + if err != nil { + return Result{ + Name: name, + Status: StatusFail, + Detail: err.Error(), + Remedy: "Check the cluster has a usable StorageClass (kubectl get sc) and the PVC is Bound (kubectl get pvc -n " + ns + ").", + } + } + return Result{ + Name: name, + Status: StatusOK, + Detail: fmt.Sprintf("%s Bound, mounted at %s", pvc.ClaimName, pvc.MountPath), + } +} + +// checkProxy surfaces the in-cluster proxy wiring read from jobs-manager's +// env — the corporate-proxy propagation that, when missing, silently strands +// egress. Informational: present config is ✔; a missing requests-proxy URL is +// the only genuine anomaly. +func checkProxy(env map[string]string) Result { + const name = "Proxy configuration" + rp := env["REQUESTS_PROXY_URL"] + if rp == "" { + return Result{ + Name: name, + Status: StatusWarn, + // jobsManagerEnv reads only literal env values, so a chart that sets + // REQUESTS_PROXY_URL via a configMap/secret ref reads as empty here — + // called out in the remedy so a ref-based install isn't mistaken for + // missing wiring. + Detail: "jobs-manager has no literal REQUESTS_PROXY_URL (chart too old, or it's set via a configMap/secret ref)", + Remedy: "Verify the requests-proxy is wired: kubectl set env deploy/-jobs-manager --list | grep PROXY", + } + } + detail := "requests-proxy=" + rp + if eg := env["EGRESS_PROXY_URL"]; eg != "" { + detail += ", egress-proxy=" + eg + } + if env["HTTPS_PROXY"] != "" || env["HTTP_PROXY"] != "" { + detail += ", corporate HTTP(S)_PROXY set" + } else { + detail += ", no corporate HTTP(S)_PROXY" + } + return Result{Name: name, Status: StatusOK, Detail: detail} +} + +// checkBackendEgress probes the tracebloc backend API. Honest scope: this +// runs from the machine the CLI is on, NOT from inside the cluster — the +// cluster egresses via its egress-proxy. An in-cluster probe is the WS3 +// follow-up; this still catches a customer network/proxy that can't reach +// the backend at all. +func checkBackendEgress(ctx context.Context, env map[string]string, probe func(context.Context, string) error) Result { + const name = "Backend egress (from this machine)" + host := backendHost(env["CLIENT_ENV"]) + url := "https://" + host + "/" + if err := probe(ctx, url); err != nil { + return Result{ + Name: name, + Status: StatusFail, + Detail: fmt.Sprintf("%s unreachable: %v", host, err), + Remedy: "Check this machine's network/proxy to " + host + ". The cluster egresses via its egress-proxy, so this is indicative, not definitive.", + } + } + return Result{Name: name, Status: StatusOK, Detail: host + " reachable"} +} + +// backendHost maps CLIENT_ENV to the backend API host, mirroring the edge +// runtime's own mapping (controller.py). Unset/unknown defaults to prod, the +// chart's CLIENT_ENV default. +func backendHost(clientEnv string) string { + // Normalize like the API client (api.ResolveEnv/BaseURL lower-case), so a + // non-lowercase CLIENT_ENV on the edge box doesn't fall through to prod and + // make the doctor probe the wrong backend. + switch strings.ToLower(strings.TrimSpace(clientEnv)) { + case "dev": + return "dev-api.tracebloc.io" + case "stg": + return "stg-api.tracebloc.io" + default: + return "api.tracebloc.io" + } +} + +// checkRequestsProxy verifies the requests-proxy deployment — the in-cluster +// broker for experiment egress (the Service Bus "experiments" queue) — is +// present and Ready. While it's down, experiments egress fails and the +// experiment silently stays Pending, the exact class this epic targets. +func checkRequestsProxy(ctx context.Context, cs kubernetes.Interface, ns string, release *cluster.ParentRelease) Result { + const name = "Service Bus egress (requests-proxy)" + dep := findDeployment(ctx, cs, ns, release, "requests-proxy") + if dep == nil { + return Result{ + Name: name, + Status: StatusFail, + Detail: "requests-proxy deployment not found", + Remedy: "The requests-proxy brokers experiment (Service Bus) egress; without it experiments stay Pending. Reinstall/upgrade the client chart.", + } + } + if dep.Status.ReadyReplicas < 1 { + return Result{ + Name: name, + Status: StatusFail, + Detail: fmt.Sprintf("requests-proxy not ready (%d/%d replicas)", dep.Status.ReadyReplicas, dep.Status.Replicas), + Remedy: "Experiments egress flows through requests-proxy; while it's down they stay Pending. kubectl describe deploy " + dep.Name + " -n " + ns, + } + } + return Result{Name: name, Status: StatusOK, Detail: "requests-proxy ready (brokers the 'experiments' queue)"} +} + +// checkNodeFit verifies at least one Ready node can satisfy the resource +// requests the jobs-manager stamps on spawned training jobs (RESOURCE_REQUESTS +// / GPU_REQUESTS env) — the "Pending forever, no node big enough" class. GPU is +// soft: when a GPU is requested but no node exposes it, that's a ⚠ (jobs-manager +// has a GPU→CPU fallback), not a hard failure. +func checkNodeFit(ctx context.Context, cs kubernetes.Interface, env map[string]string) Result { + const name = "Node capacity" + cpuReq, memReq, ok := parseCPUMem(env["RESOURCE_REQUESTS"]) + if !ok { + return Result{ + Name: name, + Status: StatusWarn, + Detail: "couldn't read RESOURCE_REQUESTS from jobs-manager — skipping node-fit", + Remedy: "kubectl set env deploy/-jobs-manager --list | grep RESOURCE_REQUESTS", + } + } + gpuName, gpuReq, gpuRequested := parseGPU(env["GPU_REQUESTS"]) + + nodes, err := cs.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) + if err != nil { + return Result{ + Name: name, + Status: StatusWarn, + Detail: "could not list nodes: " + err.Error(), + Remedy: "Ensure your kubeconfig user can list nodes.", + } + } + + req := fmt.Sprintf("cpu=%s, memory=%s", cpuReq.String(), memReq.String()) + if gpuRequested { + req += fmt.Sprintf(", %s=%s", gpuName, gpuReq.String()) + } + + // A pod gets ALL its requested resources from ONE node, so evaluate each + // node as a whole — never OR cpu/mem and GPU across different nodes, which + // would pass even when no single node can run the job (Bugbot on PR #91). + var cpuMemFits, fullFits bool + for i := range nodes.Items { + n := nodes.Items[i] + if !nodeReady(n) { + continue + } + alloc := n.Status.Allocatable + nodeCPUMem := alloc.Cpu().Cmp(cpuReq) >= 0 && alloc.Memory().Cmp(memReq) >= 0 + nodeGPU := !gpuRequested + if gpuRequested { + if q, present := alloc[gpuName]; present && q.Cmp(gpuReq) >= 0 { + nodeGPU = true + } + } + if nodeCPUMem { + cpuMemFits = true + } + if nodeCPUMem && nodeGPU { + fullFits = true + } + } + + switch { + case !cpuMemFits: + return Result{ + Name: name, + Status: StatusFail, + Detail: fmt.Sprintf("no Ready node can fit a training job (needs %s)", req), + Remedy: "Add/resize a node to meet the job's requests, or lower RESOURCE_REQUESTS on jobs-manager.", + } + case gpuRequested && !fullFits: + return Result{ + Name: name, + Status: StatusWarn, + Detail: fmt.Sprintf("no single Ready node satisfies cpu+memory AND %s — GPU jobs rely on the CPU fallback (needs %s)", gpuName, req), + Remedy: "If GPU training is expected, ensure one node has both the compute and the GPU capacity, with its device plugin.", + } + default: + return Result{Name: name, Status: StatusOK, Detail: fmt.Sprintf("a Ready node can schedule a training job (%s)", req)} + } +} + +// checkImagePull verifies that any registry pull secret the jobs-manager +// references exists and is a well-formed dockerconfigjson — so private-image +// pulls don't ImagePullBackOff. (Bad-but-well-formed credentials can't be +// detected without an actual pull; this catches a missing/empty/malformed +// secret, the common misconfig.) +func checkImagePull(ctx context.Context, cs kubernetes.Interface, ns string, release *cluster.ParentRelease) Result { + const name = "Image pull secret" + dep := findDeployment(ctx, cs, ns, release, "jobs-manager") + if dep == nil { + return Result{ + Name: name, + Status: StatusWarn, + Detail: "couldn't read jobs-manager to resolve image pull secrets — skipping", + Remedy: "Check the parent client release is installed in " + ns + ".", + } + } + secrets := dep.Spec.Template.Spec.ImagePullSecrets + if len(secrets) == 0 { + return Result{Name: name, Status: StatusOK, Detail: "no image pull secret in use (public/digest-pinned images)"} + } + for _, ref := range secrets { + sec, err := cs.CoreV1().Secrets(ns).Get(ctx, ref.Name, metav1.GetOptions{}) + if err != nil { + return Result{ + Name: name, + Status: StatusFail, + Detail: fmt.Sprintf("image pull secret %q not found", ref.Name), + Remedy: "Private image pulls will ImagePullBackOff. Reinstall the chart with valid registry credentials.", + } + } + if sec.Type != corev1.SecretTypeDockerConfigJson { + return Result{ + Name: name, + Status: StatusFail, + Detail: fmt.Sprintf("secret %q is type %q, not %s", ref.Name, sec.Type, corev1.SecretTypeDockerConfigJson), + Remedy: "Recreate it as a docker-registry secret (kubectl create secret docker-registry).", + } + } + if data := sec.Data[corev1.DockerConfigJsonKey]; len(data) == 0 || !json.Valid(data) { + return Result{ + Name: name, + Status: StatusFail, + Detail: fmt.Sprintf("secret %q has an empty or malformed %s", ref.Name, corev1.DockerConfigJsonKey), + Remedy: "Recreate the registry secret; its .dockerconfigjson isn't valid JSON.", + } + } + } + return Result{Name: name, Status: StatusOK, Detail: fmt.Sprintf("%d image pull secret(s) present and well-formed", len(secrets))} +} + +// parseResourceSpec parses jobs-manager's "k1=v1,k2=v2" resource env into a map. +func parseResourceSpec(spec string) map[string]string { + out := map[string]string{} + for _, part := range strings.Split(spec, ",") { + kv := strings.SplitN(strings.TrimSpace(part), "=", 2) + if len(kv) == 2 && strings.TrimSpace(kv[0]) != "" { + out[strings.TrimSpace(kv[0])] = strings.TrimSpace(kv[1]) + } + } + return out +} + +// parseCPUMem extracts the cpu + memory quantities from a RESOURCE_REQUESTS +// spec; ok is false unless both are present and parseable. +func parseCPUMem(spec string) (cpu, mem resource.Quantity, ok bool) { + m := parseResourceSpec(spec) + c, cOK := m["cpu"] + mm, mOK := m["memory"] + if !cOK || !mOK { + return resource.Quantity{}, resource.Quantity{}, false + } + cpu, errC := resource.ParseQuantity(c) + mem, errM := resource.ParseQuantity(mm) + if errC != nil || errM != nil { + return resource.Quantity{}, resource.Quantity{}, false + } + return cpu, mem, true +} + +// parseGPU extracts the GPU resource name + quantity from a GPU_REQUESTS spec +// (e.g. "nvidia.com/gpu=1"). requested is false when absent, unparseable, or 0. +func parseGPU(spec string) (name corev1.ResourceName, qty resource.Quantity, requested bool) { + for k, v := range parseResourceSpec(spec) { + q, err := resource.ParseQuantity(v) + if err == nil && !q.IsZero() { + return corev1.ResourceName(k), q, true + } + } + return "", resource.Quantity{}, false +} + +// nodeReady reports whether a node's Ready condition is True. +func nodeReady(n corev1.Node) bool { + for _, c := range n.Status.Conditions { + if c.Type == corev1.NodeReady { + return c.Status == corev1.ConditionTrue + } + } + return false +} + +// jobsManagerEnv reads jobs-manager's first-container plain env into a map +// (valueFrom entries have no literal value and are skipped). Best-effort: +// returns an empty map when the deployment can't be fetched. +func jobsManagerEnv(ctx context.Context, cs kubernetes.Interface, ns string, release *cluster.ParentRelease) map[string]string { + env := map[string]string{} + dep := findDeployment(ctx, cs, ns, release, "jobs-manager") + if dep == nil || len(dep.Spec.Template.Spec.Containers) == 0 { + return env + } + for _, e := range dep.Spec.Template.Spec.Containers[0].Env { + if e.Value != "" { + env[e.Name] = e.Value + } + } + return env +} + +// getDeployment returns the first of candidates that exists, or nil. +func getDeployment(ctx context.Context, cs kubernetes.Interface, ns string, candidates []string) *appsv1.Deployment { + for _, n := range candidates { + if n == "" { + continue + } + d, err := cs.AppsV1().Deployments(ns).Get(ctx, n, metav1.GetOptions{}) + if err == nil { + return d + } + } + return nil +} + +// findDeployment locates a chart component's Deployment ("", e.g. +// "requests-proxy"), tied to the discovered release so a check can never be +// satisfied by a DIFFERENT release's component or a stray bare one (Bugbot on +// PR #89). +// +// Release known: take the chart's standard "-" name, or a bare +// "" ONLY when its app.kubernetes.io/instance label ties it to this +// release (older unprefixed charts). A deployment belonging to another release +// is never accepted — if this release's component is missing, return nil and let +// the check report it. +// +// Release unknown (discovery failed): match by name suffix, but only when +// EXACTLY ONE deployment carries it. With several (multiple releases, which +// DiscoverParentRelease refuses to disambiguate) there's no safe attribution, so +// return nil rather than guess — which would let different checks describe +// different releases in one run. +func findDeployment(ctx context.Context, cs kubernetes.Interface, ns string, release *cluster.ParentRelease, suffix string) *appsv1.Deployment { + if release != nil && release.ReleaseName != "" { + if d := getDeployment(ctx, cs, ns, []string{release.ReleaseName + "-" + suffix}); d != nil { + return d + } + if d := getDeployment(ctx, cs, ns, []string{suffix}); d != nil && + d.Labels["app.kubernetes.io/instance"] == release.ReleaseName { + return d + } + return nil + } + + deps, err := cs.AppsV1().Deployments(ns).List(ctx, metav1.ListOptions{}) + if err != nil { + return nil + } + var match *appsv1.Deployment + for i := range deps.Items { + if n := deps.Items[i].Name; n == suffix || strings.HasSuffix(n, "-"+suffix) { + if match != nil { + return nil // ambiguous across releases — don't guess + } + match = &deps.Items[i] + } + } + return match +} + +// httpProbe is the default backend prober: a GET with a short timeout over +// Go's default transport, which honors HTTP(S)_PROXY/NO_PROXY via +// http.ProxyFromEnvironment. Any HTTP response (even 4xx/5xx) means the host +// is reachable — we're testing connectivity, not the endpoint's health. +func httpProbe(ctx context.Context, url string) error { + req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) + if err != nil { + return err + } + client := &http.Client{Timeout: httpProbeTimeout} + resp, err := client.Do(req) + if err != nil { + return err + } + // Connected — the host is reachable regardless of status code. Discard the + // close error so a rare post-connect close failure isn't reported as "down". + _ = resp.Body.Close() + return nil +} diff --git a/internal/doctor/doctor_test.go b/internal/doctor/doctor_test.go new file mode 100644 index 0000000..7eaaa22 --- /dev/null +++ b/internal/doctor/doctor_test.go @@ -0,0 +1,501 @@ +package doctor + +import ( + "context" + "errors" + "strings" + "testing" + "time" + + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes/fake" + + "github.com/tracebloc/cli/internal/cluster" +) + +const ns = "tracebloc" + +func bg() context.Context { return context.Background() } + +// jobsManagerDep mirrors the chart labels DiscoverParentRelease keys off +// (see internal/cluster/discover_test.go) so the fake clientset discovers it. +func jobsManagerDep(release string, env ...corev1.EnvVar) *appsv1.Deployment { + return &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: release + "-jobs-manager", + Namespace: ns, + Labels: map[string]string{ + "app.kubernetes.io/name": "client", + "app.kubernetes.io/instance": release, + "app.kubernetes.io/managed-by": "Helm", + "app.kubernetes.io/version": "1.3.5", + "helm.sh/chart": "client-1.3.5", + }, + }, + Spec: appsv1.DeploymentSpec{ + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{{Name: "api", Env: env}}, + }, + }, + }, + } +} + +func requestsProxyDep(release string, ready int32) *appsv1.Deployment { + return &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{Name: release + "-requests-proxy", Namespace: ns}, + Status: appsv1.DeploymentStatus{Replicas: 1, ReadyReplicas: ready}, + } +} + +func boundPVC() *corev1.PersistentVolumeClaim { + return &corev1.PersistentVolumeClaim{ + ObjectMeta: metav1.ObjectMeta{Name: cluster.SharedPVCClaimName, Namespace: ns}, + Spec: corev1.PersistentVolumeClaimSpec{AccessModes: []corev1.PersistentVolumeAccessMode{corev1.ReadWriteOnce}}, + Status: corev1.PersistentVolumeClaimStatus{Phase: corev1.ClaimBound}, + } +} + +func runningPod(name string) *corev1.Pod { + return &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: ns}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{{Name: "c", RestartCount: 0}}, + }, + } +} + +func crashPod(name string) *corev1.Pod { + return &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: ns}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{{ + Name: "c", + State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{Reason: "CrashLoopBackOff"}}, + }}, + }, + } +} + +// succeededPod is a finished job pod that retried before completing — a high +// RestartCount here is historical, not a current crash-loop (Bugbot on #89). +func succeededPod(name string, restarts int32) *corev1.Pod { + return &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: ns}, + Status: corev1.PodStatus{ + Phase: corev1.PodSucceeded, + ContainerStatuses: []corev1.ContainerStatus{{Name: "c", RestartCount: restarts}}, + }, + } +} + +// recoveredPod restarted several times but its container is running again now — +// recovered, not crash-looping (cf. controller recovered-container fix, #117). +func recoveredPod(name string, restarts int32) *corev1.Pod { + return &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: ns}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{{ + Name: "c", + RestartCount: restarts, + State: corev1.ContainerState{Running: &corev1.ContainerStateRunning{}}, + }}, + }, + } +} + +func pendingPod(name string, age time.Duration) *corev1.Pod { + return &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: ns, + CreationTimestamp: metav1.NewTime(time.Now().Add(age)), + }, + Status: corev1.PodStatus{Phase: corev1.PodPending}, + } +} + +// initCrashPod has an init container stuck in CrashLoopBackOff — the pod stays +// Pending and cannot start, so it must read as a failure, not a Pending warning +// (Bugbot on #89). +func initCrashPod(name string) *corev1.Pod { + return &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: ns}, + Status: corev1.PodStatus{ + Phase: corev1.PodPending, + InitContainerStatuses: []corev1.ContainerStatus{{ + Name: "init", + State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{Reason: "CrashLoopBackOff"}}, + }}, + }, + } +} + +func TestWorst(t *testing.T) { + if got := Worst(nil); got != StatusOK { + t.Fatalf("Worst(nil) = %v, want ok", got) + } + rs := []Result{{Status: StatusOK}, {Status: StatusFail}, {Status: StatusWarn}} + if got := Worst(rs); got != StatusFail { + t.Fatalf("Worst = %v, want fail", got) + } +} + +func TestCheckReachable(t *testing.T) { + if r := checkReachable(nil, errors.New("boom"), ns); r.Status != StatusFail { + t.Fatalf("error => %v, want fail", r.Status) + } + rel := &cluster.ParentRelease{ReleaseName: "tb", ChartVersion: "1.3.5", AppVersion: "1.3.5"} + r := checkReachable(rel, nil, ns) + if r.Status != StatusOK || !strings.Contains(r.Detail, "tb") { + t.Fatalf("release => %v / %q, want ok mentioning the release", r.Status, r.Detail) + } +} + +func TestCheckPods(t *testing.T) { + tests := []struct { + name string + pod *corev1.Pod + want Status + }{ + {"healthy", runningPod("ok"), StatusOK}, + {"crash-loop", crashPod("bad"), StatusFail}, + {"pending-old", pendingPod("stuck", -10*time.Minute), StatusWarn}, + {"pending-fresh", pendingPod("fresh", -time.Minute), StatusOK}, + {"succeeded-high-restarts", succeededPod("done", 5), StatusOK}, + {"recovered-running", recoveredPod("recovered", 5), StatusOK}, + {"init-crash-loop", initCrashPod("initbad"), StatusFail}, + } + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + cs := fake.NewClientset(tc.pod) + if r := checkPods(bg(), cs, ns); r.Status != tc.want { + t.Fatalf("checkPods = %v (%q), want %v", r.Status, r.Detail, tc.want) + } + }) + } +} + +func TestCheckPVC(t *testing.T) { + if r := checkPVC(bg(), fake.NewClientset(boundPVC()), ns); r.Status != StatusOK { + t.Fatalf("bound PVC => %v, want ok", r.Status) + } + if r := checkPVC(bg(), fake.NewClientset(), ns); r.Status != StatusFail { + t.Fatalf("missing PVC => %v, want fail", r.Status) + } +} + +func TestCheckProxy(t *testing.T) { + tests := []struct { + name string + env map[string]string + want Status + substr string + }{ + {"requests-proxy set", map[string]string{"REQUESTS_PROXY_URL": "http://requests-proxy-service:8888"}, StatusOK, "requests-proxy="}, + {"corporate proxy", map[string]string{"REQUESTS_PROXY_URL": "http://x", "HTTPS_PROXY": "http://corp:3128"}, StatusOK, "corporate HTTP(S)_PROXY set"}, + {"empty", map[string]string{}, StatusWarn, "REQUESTS_PROXY_URL"}, + } + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + r := checkProxy(tc.env) + if r.Status != tc.want || !strings.Contains(r.Detail, tc.substr) { + t.Fatalf("checkProxy = %v / %q, want %v containing %q", r.Status, r.Detail, tc.want, tc.substr) + } + }) + } +} + +func TestCheckBackendEgress(t *testing.T) { + okProbe := func(context.Context, string) error { return nil } + failProbe := func(context.Context, string) error { return errors.New("dns failure") } + + if r := checkBackendEgress(bg(), map[string]string{"CLIENT_ENV": "dev"}, okProbe); r.Status != StatusOK || !strings.Contains(r.Detail, "dev-api.tracebloc.io") { + t.Fatalf("reachable dev => %v / %q", r.Status, r.Detail) + } + if r := checkBackendEgress(bg(), map[string]string{}, failProbe); r.Status != StatusFail || !strings.Contains(r.Detail, "api.tracebloc.io") { + t.Fatalf("unreachable default => %v / %q", r.Status, r.Detail) + } +} + +func TestBackendHost(t *testing.T) { + tests := map[string]string{ + "dev": "dev-api.tracebloc.io", + "stg": "stg-api.tracebloc.io", + "prod": "api.tracebloc.io", + "": "api.tracebloc.io", + "weird": "api.tracebloc.io", + // Case/space-insensitive, matching the API client's env resolution — a + // non-lowercase CLIENT_ENV must not fall through to prod. + "DEV": "dev-api.tracebloc.io", + "Stg": "stg-api.tracebloc.io", + " dev ": "dev-api.tracebloc.io", + } + for in, want := range tests { + if got := backendHost(in); got != want { + t.Errorf("backendHost(%q) = %q, want %q", in, got, want) + } + } +} + +func TestCheckRequestsProxy(t *testing.T) { + rel := &cluster.ParentRelease{ReleaseName: "tb"} + tests := []struct { + name string + dep *appsv1.Deployment // nil => deployment absent + want Status + }{ + {"ready", requestsProxyDep("tb", 1), StatusOK}, + {"not-ready", requestsProxyDep("tb", 0), StatusFail}, + {"missing", nil, StatusFail}, + } + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + cs := fake.NewClientset() + if tc.dep != nil { + cs = fake.NewClientset(tc.dep) + } + if r := checkRequestsProxy(bg(), cs, ns, rel); r.Status != tc.want { + t.Fatalf("checkRequestsProxy = %v (%q), want %v", r.Status, r.Detail, tc.want) + } + }) + } +} + +// When DiscoverParentRelease failed (release nil) but a release-prefixed +// requests-proxy exists, the suffix fallback must still find it rather than +// falsely report it missing (Bugbot on #89). +func TestCheckRequestsProxy_NilReleaseFindsPrefixed(t *testing.T) { + cs := fake.NewClientset(requestsProxyDep("tb", 1)) // "tb-requests-proxy" + if r := checkRequestsProxy(bg(), cs, ns, nil); r.Status != StatusOK { + t.Fatalf("nil release with prefixed deploy => %v (%q), want ok", r.Status, r.Detail) + } +} + +// With multiple parent releases in one namespace (the case DiscoverParentRelease +// refuses) and no discovered release, the suffix fallback must NOT pick one +// arbitrarily — guessing could let different checks describe different releases +// in a single run (Bugbot on #89). It should report can't-determine, not OK. +func TestCheckRequestsProxy_NilReleaseAmbiguous(t *testing.T) { + cs := fake.NewClientset( + requestsProxyDep("relA", 1), // "relA-requests-proxy" + requestsProxyDep("relB", 1), // "relB-requests-proxy" + ) + if r := checkRequestsProxy(bg(), cs, ns, nil); r.Status == StatusOK { + t.Fatalf("ambiguous multi-release => %v (%q), want not-OK (no guessing)", r.Status, r.Detail) + } +} + +// With a release discovered, the check must be tied to THAT release: another +// release's requests-proxy must not be accepted as the discovered release's, +// or relA goes green on relB's proxy while relA's is actually missing +// (Bugbot on #89). +func TestCheckRequestsProxy_DiscoveredReleaseIgnoresOtherReleases(t *testing.T) { + rel := &cluster.ParentRelease{ReleaseName: "relA"} // relA has no requests-proxy + cs := fake.NewClientset(requestsProxyDep("relB", 1)) + if r := checkRequestsProxy(bg(), cs, ns, rel); r.Status == StatusOK { + t.Fatalf("relA proxy missing, relB present => %v (%q), want not-OK", r.Status, r.Detail) + } +} + +// A bare (unprefixed) requests-proxy is accepted only when its instance label +// ties it to the discovered release — covering older unprefixed charts. +func TestCheckRequestsProxy_BareNameAcceptedWhenLabelledForRelease(t *testing.T) { + rel := &cluster.ParentRelease{ReleaseName: "relA"} + bare := requestsProxyDep("relA", 1) + bare.Name = "requests-proxy" + bare.Labels = map[string]string{"app.kubernetes.io/instance": "relA"} + cs := fake.NewClientset(bare) + if r := checkRequestsProxy(bg(), cs, ns, rel); r.Status != StatusOK { + t.Fatalf("bare requests-proxy labelled for relA => %v (%q), want ok", r.Status, r.Detail) + } +} + +func TestRun_HealthyCluster(t *testing.T) { + const rel = "tb" + cs := fake.NewClientset( + jobsManagerDep(rel, + corev1.EnvVar{Name: "REQUESTS_PROXY_URL", Value: "http://requests-proxy-service:8888"}, + corev1.EnvVar{Name: "CLIENT_ENV", Value: "dev"}, + corev1.EnvVar{Name: "RESOURCE_REQUESTS", Value: "cpu=2,memory=8Gi"}, + ), + requestsProxyDep(rel, 1), + boundPVC(), + runningPod("tb-jobs-manager-abc"), + node("n1", "4", "16Gi"), + ) + + results := Run(bg(), cs, Options{ + Namespace: ns, + HTTPProbe: func(context.Context, string) error { return nil }, + }) + + if len(results) != 8 { + t.Fatalf("want 8 checks, got %d", len(results)) + } + if w := Worst(results); w != StatusOK { + for _, r := range results { + t.Logf("%-32s %-4s %s", r.Name, r.Status, r.Detail) + } + t.Fatalf("healthy cluster worst = %v, want ok", w) + } +} + +func node(name, cpu, mem string, gpu ...string) *corev1.Node { + alloc := corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse(cpu), + corev1.ResourceMemory: resource.MustParse(mem), + } + if len(gpu) == 2 { + alloc[corev1.ResourceName(gpu[0])] = resource.MustParse(gpu[1]) + } + return &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{Name: name}, + Status: corev1.NodeStatus{ + Allocatable: alloc, + Conditions: []corev1.NodeCondition{{Type: corev1.NodeReady, Status: corev1.ConditionTrue}}, + }, + } +} + +func TestParseCPUMem(t *testing.T) { + cpu, mem, ok := parseCPUMem("cpu=2,memory=8Gi") + if !ok || cpu.String() != "2" || mem.String() != "8Gi" { + t.Fatalf("parseCPUMem => %q %q %v", cpu.String(), mem.String(), ok) + } + if _, _, ok := parseCPUMem("cpu=2"); ok { + t.Fatalf("missing memory should be !ok") + } + if _, _, ok := parseCPUMem("cpu=abc,memory=8Gi"); ok { + t.Fatalf("unparseable cpu should be !ok") + } +} + +func TestParseGPU(t *testing.T) { + name, qty, req := parseGPU("nvidia.com/gpu=1") + if !req || string(name) != "nvidia.com/gpu" || qty.String() != "1" { + t.Fatalf("parseGPU => %q %q %v", name, qty.String(), req) + } + if _, _, req := parseGPU("nvidia.com/gpu=0"); req { + t.Fatalf("zero gpu should be !requested") + } + if _, _, req := parseGPU(""); req { + t.Fatalf("empty should be !requested") + } +} + +func TestCheckNodeFit(t *testing.T) { + full := map[string]string{"RESOURCE_REQUESTS": "cpu=2,memory=8Gi", "GPU_REQUESTS": "nvidia.com/gpu=1"} + cpuOnly := map[string]string{"RESOURCE_REQUESTS": "cpu=2,memory=8Gi"} + + t.Run("fits cpu+mem+gpu", func(t *testing.T) { + cs := fake.NewClientset(node("n1", "4", "16Gi", "nvidia.com/gpu", "2")) + if r := checkNodeFit(bg(), cs, full); r.Status != StatusOK { + t.Fatalf("=> %v (%q), want ok", r.Status, r.Detail) + } + }) + t.Run("no node big enough -> fail", func(t *testing.T) { + cs := fake.NewClientset(node("n1", "1", "2Gi")) + if r := checkNodeFit(bg(), cs, cpuOnly); r.Status != StatusFail { + t.Fatalf("=> %v (%q), want fail", r.Status, r.Detail) + } + }) + t.Run("gpu requested but none -> warn", func(t *testing.T) { + cs := fake.NewClientset(node("n1", "4", "16Gi")) // cpu/mem fit, no gpu + if r := checkNodeFit(bg(), cs, full); r.Status != StatusWarn { + t.Fatalf("=> %v (%q), want warn", r.Status, r.Detail) + } + }) + t.Run("cpu+mem and gpu on different nodes -> warn, not ok", func(t *testing.T) { + // The Bugbot #91 case: one node fits cpu/mem, a different node has the + // GPU but is too small. No single node runs a GPU job → must NOT be ok. + cs := fake.NewClientset( + node("big", "4", "16Gi"), // cpu/mem, no gpu + node("gpu", "1", "1Gi", "nvidia.com/gpu", "2"), // gpu, too small + ) + if r := checkNodeFit(bg(), cs, full); r.Status != StatusWarn { + t.Fatalf("=> %v (%q), want warn (no single node fits all)", r.Status, r.Detail) + } + }) + t.Run("single node fits cpu+mem+gpu -> ok", func(t *testing.T) { + cs := fake.NewClientset( + node("big", "4", "16Gi"), // distractor: cpu/mem only + node("full", "4", "16Gi", "nvidia.com/gpu", "1"), // satisfies everything + ) + if r := checkNodeFit(bg(), cs, full); r.Status != StatusOK { + t.Fatalf("=> %v (%q), want ok", r.Status, r.Detail) + } + }) + t.Run("not-ready node doesn't count -> fail", func(t *testing.T) { + n := node("n1", "8", "32Gi") + n.Status.Conditions = []corev1.NodeCondition{{Type: corev1.NodeReady, Status: corev1.ConditionFalse}} + cs := fake.NewClientset(n) + if r := checkNodeFit(bg(), cs, cpuOnly); r.Status != StatusFail { + t.Fatalf("=> %v (%q), want fail (node not ready)", r.Status, r.Detail) + } + }) + t.Run("missing RESOURCE_REQUESTS -> warn", func(t *testing.T) { + cs := fake.NewClientset(node("n1", "4", "16Gi")) + if r := checkNodeFit(bg(), cs, map[string]string{}); r.Status != StatusWarn { + t.Fatalf("=> %v (%q), want warn", r.Status, r.Detail) + } + }) +} + +func dockerSecret(name string, data []byte) *corev1.Secret { + return &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: ns}, + Type: corev1.SecretTypeDockerConfigJson, + Data: map[string][]byte{corev1.DockerConfigJsonKey: data}, + } +} + +func jmDepWithPullSecret(release, secretName string) *appsv1.Deployment { + d := jobsManagerDep(release) + if secretName != "" { + d.Spec.Template.Spec.ImagePullSecrets = []corev1.LocalObjectReference{{Name: secretName}} + } + return d +} + +func TestCheckImagePull(t *testing.T) { + rel := &cluster.ParentRelease{ReleaseName: "tb"} + + t.Run("no pull secret -> ok", func(t *testing.T) { + cs := fake.NewClientset(jmDepWithPullSecret("tb", "")) + if r := checkImagePull(bg(), cs, ns, rel); r.Status != StatusOK { + t.Fatalf("=> %v (%q), want ok", r.Status, r.Detail) + } + }) + t.Run("valid dockerconfigjson -> ok", func(t *testing.T) { + cs := fake.NewClientset( + jmDepWithPullSecret("tb", "reg"), + dockerSecret("reg", []byte(`{"auths":{}}`)), + ) + if r := checkImagePull(bg(), cs, ns, rel); r.Status != StatusOK { + t.Fatalf("=> %v (%q), want ok", r.Status, r.Detail) + } + }) + t.Run("missing secret -> fail", func(t *testing.T) { + cs := fake.NewClientset(jmDepWithPullSecret("tb", "reg")) // secret absent + if r := checkImagePull(bg(), cs, ns, rel); r.Status != StatusFail { + t.Fatalf("=> %v (%q), want fail", r.Status, r.Detail) + } + }) + t.Run("malformed dockerconfigjson -> fail", func(t *testing.T) { + cs := fake.NewClientset( + jmDepWithPullSecret("tb", "reg"), + dockerSecret("reg", []byte("not json")), + ) + if r := checkImagePull(bg(), cs, ns, rel); r.Status != StatusFail { + t.Fatalf("=> %v (%q), want fail", r.Status, r.Detail) + } + }) +} diff --git a/internal/push/category.go b/internal/push/category.go index c20728a..73f0ffd 100644 --- a/internal/push/category.go +++ b/internal/push/category.go @@ -1,72 +1,156 @@ package push -// Category families. These mirror data-ingestors' +import "strings" + +// CategorySpec is the single source of truth for one task category's +// CLI-relevant rules. It mirrors data-ingestors' // tracebloc_ingestor/cli/conventions.py groupings so the CLI's -// per-category behaviour (which flags are required, which local -// layout to expect, which spec fields to emit) stays in lock-step +// per-category behaviour (which local layout to expect, which spec +// fields to emit, whether a label policy is needed) stays in lock-step // with what the ingestor actually resolves. // -// Kept as a single source of truth here rather than scattered -// string comparisons across spec.go / dataset.go. - -// imageCategories take a labels CSV + an images/ directory (plus, -// for some, extra sidecar dirs handled in later increments). -var imageCategories = map[string]bool{ - "image_classification": true, - "object_detection": true, - "keypoint_detection": true, - "semantic_segmentation": true, - "instance_segmentation": true, +// Everything category-shaped derives from the registry below — the +// family predicates, the `--category` help text, the interactive +// picker, and the push accept-gate — so the enumerations can't drift +// apart (they used to: the flag help listed 5 of 9, cli#74). +type CategorySpec struct { + // ID is the canonical category identifier; it matches the + // ingest.v1 schema enum (vendored via scripts/sync-schema.sh). + ID string + // Family selects the local layout + staging shape. + Family Family + // Label is the human-friendly name shown in the interactive picker. + Label string + // RegressionClass marks categories that predict a numeric target and + // therefore need label.policy (object label form) so the raw target + // never ships to the central backend by default. + RegressionClass bool + // CLISupported reports whether `dataset push` implements the category + // today. semantic_/instance_segmentation are known (the schema + // defines them) but not yet pushable. + CLISupported bool + // UnsupportedNote explains why a known-but-unimplemented category + // isn't available yet; surfaced by the push gate. Empty when supported. + UnsupportedNote string +} + +// Family groups categories by local layout. +type Family int + +const ( + // FamilyImage: a labels CSV + an images/ directory (plus, for some, + // extra sidecar dirs like annotations/ or masks/). + FamilyImage Family = iota + // FamilyTabular: a single CSV whose columns are described by a + // `schema` (column → SQL type) map. No sidecar files. + FamilyTabular + // FamilyText: a labels CSV + a directory of text files (texts/ for + // classification, sequences/ for masked language modeling). + FamilyText +) + +// categoryRegistry is the ordered, authoritative list of every category +// the ingest.v1 schema defines. Order is the display order for help text +// and the interactive picker (CLI-supported first, in workflow order; +// the not-yet-implemented ones last). Adding a category to the schema +// means adding it here — the parity test pins the set. +var categoryRegistry = []CategorySpec{ + {ID: "image_classification", Family: FamilyImage, Label: "Image classification", CLISupported: true}, + {ID: "object_detection", Family: FamilyImage, Label: "Object detection", CLISupported: true}, + {ID: "keypoint_detection", Family: FamilyImage, Label: "Keypoint detection", CLISupported: true}, + {ID: "text_classification", Family: FamilyText, Label: "Text classification", CLISupported: true}, + {ID: "masked_language_modeling", Family: FamilyText, Label: "Masked language modeling", CLISupported: true}, + {ID: "tabular_classification", Family: FamilyTabular, Label: "Tabular classification", CLISupported: true}, + {ID: "tabular_regression", Family: FamilyTabular, Label: "Tabular regression", RegressionClass: true, CLISupported: true}, + {ID: "time_series_forecasting", Family: FamilyTabular, Label: "Time-series forecasting", RegressionClass: true, CLISupported: true}, + {ID: "time_to_event_prediction", Family: FamilyTabular, Label: "Time-to-event prediction", RegressionClass: true, CLISupported: true}, + {ID: "semantic_segmentation", Family: FamilyImage, Label: "Semantic segmentation", CLISupported: false, + UnsupportedNote: "blocked on the ingestor's mask-sidecar support (data-ingestors#136)"}, + {ID: "instance_segmentation", Family: FamilyImage, Label: "Instance segmentation", CLISupported: false, + UnsupportedNote: "not implemented"}, + {ID: "causal_language_modeling", Family: FamilyText, Label: "Causal language modeling", CLISupported: false, + UnsupportedNote: "schema-recognized (data-ingestors#805); `tracebloc ingest` discover/build for its raw-.txt / prompt\\tcompletion `texts` layout is pending"}, + {ID: "token_classification", Family: FamilyText, Label: "Token classification", CLISupported: false, + UnsupportedNote: "schema-recognized; the CLI doesn't stage its per-token-label `texts` layout yet"}, +} + +// categoryByID indexes the registry for O(1) lookup, built once. +var categoryByID = func() map[string]CategorySpec { + m := make(map[string]CategorySpec, len(categoryRegistry)) + for _, c := range categoryRegistry { + m[c.ID] = c + } + return m +}() + +// Lookup returns the spec for a category id and whether it is known. +func Lookup(category string) (CategorySpec, bool) { + c, ok := categoryByID[category] + return c, ok +} + +// IsKnown reports whether category is a recognized task category (in the +// schema), supported by the CLI or not. +func IsKnown(category string) bool { + _, ok := categoryByID[category] + return ok } -// tabularCategories take a single CSV whose columns are described by -// a `schema` (column → SQL type) map. No sidecar files. -var tabularCategories = map[string]bool{ - "tabular_classification": true, - "tabular_regression": true, - "time_series_forecasting": true, - "time_to_event_prediction": true, +// IsCLISupported reports whether `dataset push` implements category today. +func IsCLISupported(category string) bool { return categoryByID[category].CLISupported } + +// IsImage reports whether category uses the labels.csv + images/ layout. +func IsImage(category string) bool { + c, ok := categoryByID[category] + return ok && c.Family == FamilyImage } -// regressionClassCategories predict a numeric target rather than a -// class. The schema requires the label in object form with an -// explicit `policy` so the raw target never ships to the central -// backend by default (policy=bucket bins it first). -var regressionClassCategories = map[string]bool{ - "tabular_regression": true, - "time_series_forecasting": true, - "time_to_event_prediction": true, +// IsTabular reports whether category uses the single-CSV + schema layout. +func IsTabular(category string) bool { + c, ok := categoryByID[category] + return ok && c.Family == FamilyTabular } -// textCategories take a labels CSV + a directory of text files -// (texts/ for classification, sequences/ for masked language -// modeling). masked_language_modeling additionally needs a -// tokenizer.json at the dataset root and has NO label. -var textCategories = map[string]bool{ - "text_classification": true, - "masked_language_modeling": true, +// IsText reports whether category uses the labels.csv + text-file dir layout. +func IsText(category string) bool { + c, ok := categoryByID[category] + return ok && c.Family == FamilyText } -// IsImage reports whether category uses the labels.csv + images/ -// local layout. -func IsImage(category string) bool { return imageCategories[category] } +// IsRegressionClass reports whether category predicts a numeric target and +// therefore needs label.policy (object label form). +func IsRegressionClass(category string) bool { return categoryByID[category].RegressionClass } -// IsTabular reports whether category uses the single-CSV + schema -// local layout (no sidecar files). -func IsTabular(category string) bool { return tabularCategories[category] } +// SupportedCategoryIDs returns the ids `dataset push` supports, in display +// order. Used to build the --category help, the interactive picker, and +// the accept-gate's "Supported:" lists from one place. +func SupportedCategoryIDs() []string { + ids := make([]string, 0, len(categoryRegistry)) + for _, c := range categoryRegistry { + if c.CLISupported { + ids = append(ids, c.ID) + } + } + return ids +} -// IsRegressionClass reports whether category predicts a numeric -// target and therefore needs label.policy (object label form). -func IsRegressionClass(category string) bool { return regressionClassCategories[category] } +// AllCategoryIDs returns every recognized category id, in registry order. +func AllCategoryIDs() []string { + ids := make([]string, 0, len(categoryRegistry)) + for _, c := range categoryRegistry { + ids = append(ids, c.ID) + } + return ids +} -// IsText reports whether category uses the labels.csv + text-file -// directory (texts/ or sequences/) local layout. -func IsText(category string) bool { return textCategories[category] } +// SupportedCategoriesList is the comma-joined supported ids, for help text +// and gate error messages. +func SupportedCategoriesList() string { return strings.Join(SupportedCategoryIDs(), ", ") } // TextSidecarDir returns the sidecar directory name a text category // expects: "sequences" for masked_language_modeling, "texts" for -// text_classification. (Used both as the local subdir to stage and -// the spec field to emit.) +// text_classification. (Used both as the local subdir to stage and the +// spec field to emit.) func TextSidecarDir(category string) string { if category == "masked_language_modeling" { return "sequences" diff --git a/internal/push/category_registry_test.go b/internal/push/category_registry_test.go new file mode 100644 index 0000000..17ce1f3 --- /dev/null +++ b/internal/push/category_registry_test.go @@ -0,0 +1,134 @@ +package push + +import ( + "encoding/json" + "sort" + "testing" + + "github.com/tracebloc/cli/internal/schema" +) + +// The registry is the single source of truth; these pin its contents and +// that the family predicates + the supported set all derive from it, so a +// future edit can't reintroduce the "5 of 9" drift (cli#74). + +func TestRegistryKnownCategories(t *testing.T) { + want := []string{ + "image_classification", "object_detection", "keypoint_detection", + "semantic_segmentation", "instance_segmentation", + "text_classification", "token_classification", + "masked_language_modeling", "causal_language_modeling", + "tabular_classification", "tabular_regression", + "time_series_forecasting", "time_to_event_prediction", + } + if got := AllCategoryIDs(); !equalSet(got, want) { + t.Fatalf("AllCategoryIDs() = %v, want set %v", got, want) + } + for _, id := range want { + if !IsKnown(id) { + t.Errorf("IsKnown(%q) = false, want true", id) + } + } + if IsKnown("not_a_category") { + t.Error(`IsKnown("not_a_category") = true, want false`) + } +} + +func TestSupportedCategories(t *testing.T) { + got := SupportedCategoryIDs() + if len(got) != 9 { + t.Fatalf("SupportedCategoryIDs() len = %d, want 9: %v", len(got), got) + } + for _, id := range got { + if !IsCLISupported(id) { + t.Errorf("SupportedCategoryIDs returned %q but IsCLISupported is false", id) + } + } + // segmentation + causal_language_modeling are known but not yet pushable, + // and must explain why. + for _, id := range []string{"semantic_segmentation", "instance_segmentation", "causal_language_modeling", "token_classification"} { + if !IsKnown(id) { + t.Errorf("%s should be known", id) + } + if IsCLISupported(id) { + t.Errorf("%s should not be CLI-supported yet", id) + } + if spec, _ := Lookup(id); spec.UnsupportedNote == "" { + t.Errorf("%s should carry an UnsupportedNote", id) + } + } +} + +func TestPredicatesDeriveFromRegistry(t *testing.T) { + for _, c := range categoryRegistry { + switch c.Family { + case FamilyImage: + if !IsImage(c.ID) || IsTabular(c.ID) || IsText(c.ID) { + t.Errorf("%s: predicates disagree with FamilyImage", c.ID) + } + case FamilyTabular: + if !IsTabular(c.ID) || IsImage(c.ID) || IsText(c.ID) { + t.Errorf("%s: predicates disagree with FamilyTabular", c.ID) + } + case FamilyText: + if !IsText(c.ID) || IsImage(c.ID) || IsTabular(c.ID) { + t.Errorf("%s: predicates disagree with FamilyText", c.ID) + } + } + if IsRegressionClass(c.ID) != c.RegressionClass { + t.Errorf("%s: IsRegressionClass = %v, want %v", c.ID, IsRegressionClass(c.ID), c.RegressionClass) + } + } + // An unknown category: every predicate false (no panic on missing key). + if IsImage("nope") || IsTabular("nope") || IsText("nope") || + IsRegressionClass("nope") || IsCLISupported("nope") { + t.Error("predicates should all be false for an unknown category") + } +} + +// TestRegistryCoversSchemaCategories pins registry⇄schema parity: every +// category the ingest schema accepts must be known to the registry, or a +// schema-valid `dataset push --category=X` is wrongly rejected as +// "unrecognized" (the token_classification drift, Bugbot v0.4.0 RC). The +// existing tests only pin the registry against a hand-written list, which +// stays internally consistent while drifting from the schema — this closes +// that gap. The reverse direction isn't required: the registry may carry a +// known-but-unsupported alias the v1 schema doesn't list yet (e.g. +// instance_segmentation), which is gated out before schema validation. +func TestRegistryCoversSchemaCategories(t *testing.T) { + var doc struct { + Properties struct { + Category struct { + Enum []string `json:"enum"` + } `json:"category"` + } `json:"properties"` + } + if err := json.Unmarshal(schema.V1Bytes, &doc); err != nil { + t.Fatalf("parse embedded schema: %v", err) + } + if len(doc.Properties.Category.Enum) == 0 { + t.Fatal("no category enum found in the embedded schema (parse path wrong?)") + } + for _, id := range doc.Properties.Category.Enum { + if !IsKnown(id) { + t.Errorf("schema category %q missing from the registry — `dataset push --category=%s` "+ + "would be rejected as unrecognized despite passing schema validation", id, id) + } + } +} + +func equalSet(a, b []string) bool { + if len(a) != len(b) { + return false + } + as := append([]string(nil), a...) + bs := append([]string(nil), b...) + sort.Strings(as) + sort.Strings(bs) + for i := range as { + if as[i] != bs[i] { + return false + } + } + return true +} diff --git a/internal/push/teardown.go b/internal/push/teardown.go index 5d30ccf..1c270fe 100644 --- a/internal/push/teardown.go +++ b/internal/push/teardown.go @@ -5,11 +5,11 @@ import ( "context" "fmt" "strings" + "time" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes" - "k8s.io/client-go/rest" ) // IngestionDatabase is the MySQL schema jobs-manager ingests tables @@ -49,24 +49,30 @@ type TeardownResult struct { RemovedPaths []string } -// Teardown performs the in-cluster teardown described by plan, mirroring -// the manual kubectl-exec cleanup: +// Teardown performs the in-cluster teardown described by plan: // // - DROP the MySQL table by exec-ing `mysql` inside the mysql pod, // referencing the pod's own $MYSQL_ROOT_PASSWORD — so no database // credential ever transits the CLI. -// - rm -rf the PVC dirs by exec-ing inside the jobs-manager pod, -// which mounts the shared PVC at SharedRoot. +// - rm -rf the PVC dirs from a short-lived pod that mirrors the CLI's +// stage pod (uid 65532 + fsGroup 65532, shared PVC mounted) — built +// from podOpts via BuildStagePodSpec. // -// DESIGN NOTE: this exec-into-existing-pods approach is the -// "CLI-direct teardown" (the alternative considered was a server-side -// jobs-manager delete endpoint). It assumes (a) a pod whose name -// contains "mysql" exposes $MYSQL_ROOT_PASSWORD, and (b) the -// jobs-manager pod mounts the shared PVC at SharedRoot — both true for -// the current parent chart, but worth confirming before this ships. -func Teardown(ctx context.Context, cs kubernetes.Interface, cfg *rest.Config, namespace string, plan TeardownPlan) (TeardownResult, error) { +// Why an ephemeral stage-identity pod and NOT the long-lived +// jobs-manager pod (tracebloc/client#259): the staging files under +// SharedRoot/.tracebloc-staging/
are written by the stage pod as +// uid 65532 (and the ingestor's SharedRoot/
files as uid 65534). +// The jobs-manager pod runs as a different non-root uid with no shared +// fsGroup, so its `rm` hit EACCES on 65532-owned files in a +// non-group-writable directory and left orphans. A teardown pod that +// runs as the same uid that wrote the staging files OWNS them, so it +// deletes them by ownership — which works on hostPath (where fsGroup is +// a no-op, kubernetes/kubernetes#138411) and CSI alike. +// +// DESIGN NOTE: still assumes a pod whose name contains "mysql" exposes +// $MYSQL_ROOT_PASSWORD (true for the current parent chart). +func Teardown(ctx context.Context, cs kubernetes.Interface, exec Executor, namespace string, plan TeardownPlan, podOpts PodSpecOptions) (TeardownResult, error) { var res TeardownResult - exec := &SPDYExecutor{Config: cfg, Client: cs} // 1. DROP the table — mysql pod, localhost, its own root password. mysqlPod, mysqlContainer, err := findRunningPod(ctx, cs, namespace, "mysql") @@ -82,14 +88,28 @@ func Teardown(ctx context.Context, cs kubernetes.Interface, cfg *rest.Config, na } res.DroppedTable = true - // 2. rm the PVC dirs — jobs-manager pod mounts the shared PVC. - jmPod, jmContainer, err := findRunningPod(ctx, cs, namespace, "jobs-manager") + // 2. rm the PVC dirs from an ephemeral stage-identity pod (see the + // doc note above + #259). The pod owns the staging files it + // deletes, so this works on hostPath and CSI. + podOpts.Namespace = namespace + podName, err := CreateStagePod(ctx, cs, podOpts) if err != nil { - return res, fmt.Errorf("locating jobs-manager pod: %w", err) + return res, fmt.Errorf("creating teardown pod: %w", err) + } + // Always clean up the teardown pod — even if the wait/exec fails or + // the parent ctx is cancelled. Fresh ctx so the delete still reaches + // the API. Mirrors push.Stage's deferred cleanup. + defer func() { + delCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + _ = DeleteStagePod(delCtx, cs, namespace, podName) + }() + if _, err := WaitForStagePodReady(ctx, cs, namespace, podName); err != nil { + return res, fmt.Errorf("waiting for teardown pod: %w", err) } stderr.Reset() rmCmd := append([]string{"rm", "-rf"}, plan.PVCPaths...) - if err := exec.Exec(ctx, namespace, jmPod, jmContainer, rmCmd, nil, nil, &stderr); err != nil { + if err := exec.Exec(ctx, namespace, podName, "stage", rmCmd, nil, nil, &stderr); err != nil { return res, fmt.Errorf("removing PVC paths: %w%s", err, stderrSuffix(&stderr)) } res.RemovedPaths = plan.PVCPaths diff --git a/internal/push/teardown_test.go b/internal/push/teardown_test.go index a3e9444..d7275d7 100644 --- a/internal/push/teardown_test.go +++ b/internal/push/teardown_test.go @@ -1,6 +1,15 @@ package push -import "testing" +import ( + "context" + "strings" + "testing" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes/fake" + k8stesting "k8s.io/client-go/testing" +) // TestPlanTeardown pins the artifact set `dataset rm` targets: the // MySQL table in IngestionDatabase + both PVC dirs (final dest + @@ -28,3 +37,81 @@ func TestPlanTeardown(t *testing.T) { } } } + +// TestTeardown_RemovesViaStageIdentityPod is the regression test for +// tracebloc/client#259: `dataset rm` must NOT run the file `rm` inside +// the long-lived jobs-manager pod (a non-root uid that cannot delete +// the uid-65532-owned staging files). It must run it in a short-lived +// pod that mirrors the stage pod's identity (uid 65532 + fsGroup 65532), +// which owns the staging files and so can delete them on any volume type +// (hostPath included, where fsGroup is a no-op). +func TestTeardown_RemovesViaStageIdentityPod(t *testing.T) { + // A running "mysql" pod must exist so step 1 (DROP TABLE) can locate + // it. The teardown pod is created by Teardown itself and marked Ready + // by the reactor (shared with stage_test.go). + cs := fake.NewClientset(&corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "mysql-0", Namespace: "tracebloc"}, + Spec: corev1.PodSpec{Containers: []corev1.Container{{Name: "mysql"}}}, + Status: corev1.PodStatus{Phase: corev1.PodRunning}, + }) + readyOnNextGet(cs) + fe := &fakeExecutor{} + + plan := PlanTeardown("reg_train") + res, err := Teardown(context.Background(), cs, fe, "tracebloc", plan, PodSpecOptions{ + Namespace: "tracebloc", + PVCClaimName: "client-pvc", + PVCMountPath: "/data/shared", + Table: "reg_train", + }) + if err != nil { + t.Fatalf("Teardown: %v", err) + } + if !res.DroppedTable { + t.Error("DroppedTable = false, want true") + } + + // The rm is the LAST Exec call. It must target the ephemeral stage + // pod, NOT a jobs-manager pod — that's the #259 fix. + if !strings.HasPrefix(fe.gotPod, "tracebloc-stage-") { + t.Errorf("rm ran in pod %q, want the ephemeral stage pod (tracebloc-stage-*); "+ + "running it in the jobs-manager pod is the #259 bug", fe.gotPod) + } + if strings.Contains(fe.gotPod, "jobs-manager") { + t.Errorf("rm ran in the jobs-manager pod (%q) — the #259 regression", fe.gotPod) + } + if fe.gotContainer != "stage" { + t.Errorf("rm container = %q, want stage", fe.gotContainer) + } + wantCmd := "rm -rf " + strings.Join(plan.PVCPaths, " ") + if got := strings.Join(fe.gotCmd, " "); got != wantCmd { + t.Errorf("rm cmd = %q, want %q", got, wantCmd) + } + + // The teardown pod must run as the stage uid (65532) + fsGroup so it + // OWNS the staging files it deletes. Inspect the created Pod via the + // fake clientset action log (the pod is deleted before the test ends). + var sc *corev1.PodSecurityContext + for _, action := range cs.Actions() { + if action.GetVerb() == "create" && action.GetResource().Resource == "pods" { + sc = action.(k8stesting.CreateAction).GetObject().(*corev1.Pod).Spec.SecurityContext + break + } + } + if sc == nil { + t.Fatal("no Pod create observed — teardown did not spawn an ephemeral pod") + } + if sc.RunAsUser == nil || *sc.RunAsUser != 65532 { + t.Errorf("teardown pod RunAsUser = %v, want 65532", sc.RunAsUser) + } + if sc.FSGroup == nil || *sc.FSGroup != 65532 { + t.Errorf("teardown pod FSGroup = %v, want 65532", sc.FSGroup) + } + + // No leaked teardown pods after a successful run. + pods, _ := cs.CoreV1().Pods("tracebloc").List(context.Background(), + metav1.ListOptions{LabelSelector: StagePodManagedByLabel + "=" + StagePodManagedByValue}) + if len(pods.Items) != 0 { + t.Errorf("Teardown leaked %d stage pod(s)", len(pods.Items)) + } +} diff --git a/internal/schema/ingest.v1.json b/internal/schema/ingest.v1.json index d8eb83c..b5332b9 100644 --- a/internal/schema/ingest.v1.json +++ b/internal/schema/ingest.v1.json @@ -30,13 +30,14 @@ "object_detection", "keypoint_detection", "semantic_segmentation", - "instance_segmentation", "text_classification", + "token_classification", "tabular_classification", "tabular_regression", "time_series_forecasting", "time_to_event_prediction", - "masked_language_modeling" + "masked_language_modeling", + "causal_language_modeling" ], "description": "Task category. Drives convention defaults: validators, data_format, default columns, default file extensions, default validator set." }, @@ -86,7 +87,7 @@ "texts": { "type": "string", "minLength": 1, - "description": "Directory holding text files referenced by the labels CSV. Required for text_classification." + "description": "Directory holding text files referenced by the labels CSV. Required for text_classification, token_classification, and causal_language_modeling (raw .txt: plain text, or a tab-separated prompt\\tcompletion pair)." }, "sequences": { @@ -143,7 +144,7 @@ "items": { "type": "integer", "minimum": 1 }, "minItems": 2, "maxItems": 2, - "description": "[height, width] to resize images to. Required for keypoint_detection (no default — depends on the customer's pose model). Other image categories use category defaults if unset." + "description": "[width, height] to resize images to. Required for keypoint_detection (no default — depends on the customer's pose model). Other image categories use category defaults if unset. The order matches PIL.Image.size and what ImageResolutionValidator expects." }, "number_of_keypoints": { @@ -198,7 +199,7 @@ "items": { "type": "integer", "minimum": 1 }, "minItems": 2, "maxItems": 2, - "description": "[height, width]. Image categories only. Default [512, 512]." + "description": "[width, height]. Image categories only. Default [512, 512]. The order matches PIL.Image.size and what ImageResolutionValidator expects." }, "extension": { "type": "string", @@ -288,8 +289,7 @@ "image_classification", "object_detection", "keypoint_detection", - "semantic_segmentation", - "instance_segmentation" + "semantic_segmentation" ] } }, @@ -321,6 +321,14 @@ }, "then": { "required": ["texts"] } }, + { + "description": "token_classification requires `texts`.", + "if": { + "properties": { "category": { "const": "token_classification" } }, + "required": ["category"] + }, + "then": { "required": ["texts"] } + }, { "description": "masked_language_modeling requires `sequences`.", "if": { @@ -329,6 +337,14 @@ }, "then": { "required": ["sequences"] } }, + { + "description": "causal_language_modeling requires `texts` (raw .txt samples). It is self-supervised, so unlike text/token classification it does NOT require `label`.", + "if": { + "properties": { "category": { "const": "causal_language_modeling" } }, + "required": ["category"] + }, + "then": { "required": ["texts"] } + }, { "description": "tabular and time-series categories require `schema`.", "if": { @@ -388,8 +404,8 @@ "object_detection", "keypoint_detection", "semantic_segmentation", - "instance_segmentation", "text_classification", + "token_classification", "tabular_classification" ] } @@ -397,6 +413,21 @@ "required": ["category"] }, "then": { "required": ["label"] } + }, + { + "description": "Self-supervised categories MUST NOT set `label`. The shipped CSV has no label column, and the framework registers no edge-label metadata for them. Setting `label` anyway used to ingest rows successfully, then crash at backend registration with a misleading HTTP 400 'No data found' (issue #213). Reject at submission instead.", + "if": { + "properties": { + "category": { + "enum": [ + "masked_language_modeling", + "causal_language_modeling" + ] + } + }, + "required": ["category"] + }, + "then": { "not": { "required": ["label"] } } } ] } diff --git a/internal/slug/slug.go b/internal/slug/slug.go new file mode 100644 index 0000000..fb3f8a0 --- /dev/null +++ b/internal/slug/slug.go @@ -0,0 +1,94 @@ +// Package slug ports the RFC-0001 namespace-slug rule (backend +// common/utils/slug.py, RFC-0001 Appendix B) to Go. A client's display name is +// slugified ONCE at provisioning into the immutable Kubernetes namespace, so +// this MUST stay in lock-step with the Python definition — the backend +// validates exactly what this produces (RFC-0001 backend#830; provisioning + +// namespace validation in backend#836). +package slug + +import ( + "fmt" + "regexp" + "strings" + + "golang.org/x/text/unicode/norm" +) + +// MaxLabelLength is the DNS-1123 label cap. +const MaxLabelLength = 63 + +var ( + nonAlnum = regexp.MustCompile(`[^a-z0-9]+`) + multiDash = regexp.MustCompile(`-+`) +) + +// Slugify maps name to a DNS-1123 label, or "" if nothing survives. Mirrors +// slug.slugify_dns1123: NFKD-transliterate to ASCII, lowercase, map every run +// of non-alphanumerics to a single "-", trim leading/trailing "-", cap at 63. +func Slugify(name string) string { + if name == "" { + return "" + } + s := strings.ToLower(toASCII(name)) + s = nonAlnum.ReplaceAllString(s, "-") + // Redundant after the run-collapse above, but slug.py runs the identical + // second pass — mirror it so the two stay structurally in lock-step. + s = multiDash.ReplaceAllString(s, "-") + s = strings.Trim(s, "-") + if len(s) > MaxLabelLength { + s = s[:MaxLabelLength] + } + return strings.TrimRight(s, "-") +} + +// Derive returns a UNIQUE DNS-1123 slug for name, avoiding taken. On collision +// it appends -2, -3, … within the 63-char cap. If name slugifies to empty it +// falls back to fallback; with an empty fallback it errors (an empty slug must +// never silently become a namespace). Mirrors slug.derive_slug — except Go has +// no nil string, so fallback=="" is treated as "no fallback" (erroring); the +// Python distinguishes None from "" but no caller passes "", and erroring is +// the safer side of that edge. +func Derive(name string, taken []string, fallback string) (string, error) { + base := Slugify(name) + if base == "" { + if fallback == "" { + return "", fmt.Errorf("name %q slugifies to empty; a fallback is required", name) + } + if base = Slugify(fallback); base == "" { + base = fallback + } + } + set := make(map[string]struct{}, len(taken)) + for _, t := range taken { + set[t] = struct{}{} + } + if _, clash := set[base]; !clash { + return base, nil + } + for n := 2; ; n++ { + suffix := fmt.Sprintf("-%d", n) + end := MaxLabelLength - len(suffix) + if end > len(base) { + end = len(base) + } + if end < 0 { + end = 0 + } + cand := strings.TrimRight(base[:end], "-") + suffix + if _, clash := set[cand]; !clash { + return cand, nil + } + } +} + +func toASCII(s string) string { + // NFKD decompose then drop non-ASCII — matches Python's + // unicodedata.normalize("NFKD", s).encode("ascii", "ignore"). + var b strings.Builder + for _, r := range norm.NFKD.String(s) { + if r < 128 { + b.WriteByte(byte(r)) + } + } + return b.String() +} diff --git a/internal/slug/slug_golden_test.go b/internal/slug/slug_golden_test.go new file mode 100644 index 0000000..04e25c2 --- /dev/null +++ b/internal/slug/slug_golden_test.go @@ -0,0 +1,44 @@ +package slug + +import "testing" + +// goldenPairs are (display name → DNS-1123 slug) pairs verified BYTE-IDENTICAL +// against the canonical Python slugify_dns1123 (backend common/utils/slug.py) +// with a cross-language harness. They lock the NFKD transliteration — the one +// place a Go port can silently drift from the backend validator that rejects +// what this produces. If slug.py ever changes, re-run the harness and update +// these rather than hand-editing. +var goldenPairs = []struct{ name, want string }{ + {"My Client", "my-client"}, + {"café", "cafe"}, // canonical NFKD: é → e + accent (dropped) + {"CAFÉ", "cafe"}, // + lowercase + {"Müller GmbH", "muller-gmbh"}, + {"Straße 42", "strae-42"}, // ß does not decompose → dropped (matches Python) + {"naïve", "naive"}, + {"São Paulo", "sao-paulo"}, + {"Zürich-Lab", "zurich-lab"}, + {"北京 client", "client"}, // CJK dropped + {"client🚀rocket", "clientrocket"}, // emoji dropped + {"über_cool", "uber-cool"}, + {"piñata", "pinata"}, + {"Ω omega", "omega"}, + {"fi-ligature", "fi-ligature"}, // compat NFKD: fi ligature → "fi" + {"①②③ circled", "123-circled"}, // circled digits → "123" + {"½ half", "12-half"}, // vulgar fraction → "1" + fraction-slash(dropped) + "2" + {"FULL width", "full-width"}, // fullwidth → ASCII + {"trailing---dashes---", "trailing-dashes"}, + {"UPPER MixED", "upper-mixed"}, + {"a.b.c-d_e f", "a-b-c-d-e-f"}, + {"2001: A Space Odyssey", "2001-a-space-odyssey"}, + {"Ⅻ roman", "xii-roman"}, // roman numeral → "XII" → "xii" + {"²cubed³", "2cubed3"}, // super/subscripts → digits + {"Hello @ World #1", "hello-world-1"}, +} + +func TestSlugifyGoldenParity(t *testing.T) { + for _, p := range goldenPairs { + if got := Slugify(p.name); got != p.want { + t.Errorf("Slugify(%q) = %q, want %q (golden parity with slug.py)", p.name, got, p.want) + } + } +} diff --git a/internal/slug/slug_test.go b/internal/slug/slug_test.go new file mode 100644 index 0000000..ac5c516 --- /dev/null +++ b/internal/slug/slug_test.go @@ -0,0 +1,45 @@ +package slug + +import ( + "strings" + "testing" +) + +func TestSlugify(t *testing.T) { + cases := map[string]string{ + "My Client": "my-client", + "café": "cafe", // NFKD transliteration, matches slug.py + " spaces ": "spaces", + "UPPER_Case": "upper-case", + "a--b": "a-b", // collapse consecutive dashes + "--trim--": "trim", + "": "", + "!!!": "", + strings.Repeat("a", 70): strings.Repeat("a", 63), // 63-char cap + } + for in, want := range cases { + if got := Slugify(in); got != want { + t.Errorf("Slugify(%q) = %q, want %q", in, got, want) + } + } +} + +func TestDerive(t *testing.T) { + if got, err := Derive("My Client", nil, ""); err != nil || got != "my-client" { + t.Errorf("no collision: got %q, err %v", got, err) + } + if got, _ := Derive("My Client", []string{"my-client"}, ""); got != "my-client-2" { + t.Errorf("collision: got %q, want my-client-2", got) + } + if got, _ := Derive("My Client", []string{"my-client", "my-client-2"}, ""); got != "my-client-3" { + t.Errorf("double collision: got %q, want my-client-3", got) + } + // CJK slugifies to empty → uses fallback + if got, _ := Derive("世界", nil, "client-abc"); got != "client-abc" { + t.Errorf("fallback: got %q, want client-abc", got) + } + // empty slug + no fallback → error (never a silent-empty namespace) + if _, err := Derive("!!!", nil, ""); err == nil { + t.Error("expected an error for empty slug with no fallback") + } +}