From 629773d21bed168ea054c1e19b6d8bafc81832a8 Mon Sep 17 00:00:00 2001 From: shujaat hasan Date: Tue, 2 Jun 2026 14:12:59 +0200 Subject: [PATCH 1/2] fix(resource-monitor): always grant read-only ClusterRole (decouple from clusterScope) Under clusterScope: false the chart rendered only a namespace-scoped Role in the release namespace. But the resource-monitor's code: * calls core_v1_api.list_pod_for_all_namespaces(field_selector=spec.nodeName=...) -- a CLUSTER-SCOPED list verb a namespaced Role can never satisfy; and * read_namespaced_pod()s its OWN pod, which lives in .Values.nodeAgents.namespace.name (NOT .Release.Namespace). So with clusterScope: false the DaemonSet 403'd on startup and crashlooped (70+ restarts observed on a live cluster). Per-node monitoring is intrinsically cluster-scoped. Always render the read-only ClusterRole + ClusterRoleBinding regardless of clusterScope (get/list/watch on pods/nodes/namespaces + metrics; no write, exec, or secret access). resourceMonitor: false still fully disables the component. clusterScope continues to gate the training/jobs isolation footprint elsewhere -- it must not leave the node monitor without permissions it cannot run without. Co-Authored-By: Claude Opus 4.8 --- client/templates/resource-monitor-rbac.yaml | 64 ++++++--------------- 1 file changed, 18 insertions(+), 46 deletions(-) diff --git a/client/templates/resource-monitor-rbac.yaml b/client/templates/resource-monitor-rbac.yaml index af59347..59ead14 100644 --- a/client/templates/resource-monitor-rbac.yaml +++ b/client/templates/resource-monitor-rbac.yaml @@ -11,7 +11,24 @@ metadata: labels: {{- include "tracebloc.labels" . | nindent 4 }} app: {{ include "tracebloc.resourceMonitorName" . }} -{{- if ne .Values.clusterScope false }} +{{/* + Per-node monitoring is INTRINSICALLY cluster-scoped, so the resource-monitor + always needs a ClusterRole -- it is deliberately NOT gated on .Values.clusterScope. 
+ The code path that requires it: + * resource_monitor.py uses core_v1_api.list_pod_for_all_namespaces( + field_selector="spec.nodeName=...") to enumerate every pod on the node. + list_pod_for_all_namespaces is a CLUSTER-SCOPED list verb that a namespaced + Role can never satisfy (it 403s -> CrashLoopBackOff). + * It also read_namespaced_pod()s its OWN pod, which lives in + .Values.nodeAgents.namespace.name (NOT .Release.Namespace), so a Role scoped + to the release namespace would miss it too. + This ClusterRole is strictly READ-ONLY (get/list/watch on pod/node metadata + + metrics); it grants no write/exec/secret access. clusterScope continues to gate + the training/jobs isolation footprint elsewhere -- it must not cripple node + telemetry by leaving the DaemonSet without the permissions it cannot run without. + If a deployment genuinely cannot allow any cluster-scoped read, disable the + monitor entirely via .Values.resourceMonitor=false rather than deploying it broken. +*/}} --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole @@ -49,49 +66,4 @@ subjects: - kind: ServiceAccount name: {{ include "tracebloc.resourceMonitorName" . }} namespace: {{ .Values.nodeAgents.namespace.name }} -{{- else }} ---- -# Role + RoleBinding live in the RELEASE namespace so the resource-monitor -# can list pods/logs where the actual workloads run. The RoleBinding -# subject points at the ServiceAccount in the node-agents namespace -# (cross-namespace bindings are valid; the Role scope is what matters). -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - name: tracebloc-resource-monitor-{{ .Release.Name }} - namespace: {{ .Release.Namespace }} - annotations: - meta.helm.sh/release-name: {{ .Release.Name }} - meta.helm.sh/release-namespace: {{ .Release.Namespace }} - labels: - {{- include "tracebloc.labels" . | nindent 4 }} - app: {{ include "tracebloc.resourceMonitorName" . 
}} -rules: - - apiGroups: [""] - resources: ["pods", "pods/log"] - verbs: ["get", "list", "watch"] - - apiGroups: ["metrics.k8s.io"] - resources: ["pods"] - verbs: ["get", "list", "watch"] ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: tracebloc-resource-monitor-{{ .Release.Name }} - namespace: {{ .Release.Namespace }} - annotations: - meta.helm.sh/release-name: {{ .Release.Name }} - meta.helm.sh/release-namespace: {{ .Release.Namespace }} - labels: - {{- include "tracebloc.labels" . | nindent 4 }} - app: {{ include "tracebloc.resourceMonitorName" . }} -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: Role - name: tracebloc-resource-monitor-{{ .Release.Name }} -subjects: - - kind: ServiceAccount - name: {{ include "tracebloc.resourceMonitorName" . }} - namespace: {{ .Values.nodeAgents.namespace.name }} -{{- end }} {{- end }} From 6a869d831cd148c06e53d814e7eeb4f38ab81f44 Mon Sep 17 00:00:00 2001 From: shujaat hasan Date: Tue, 2 Jun 2026 14:36:18 +0200 Subject: [PATCH 2/2] test(resource-monitor): assert always-cluster-scoped RBAC under clusterScope=false Follow-up to the RBAC fix: node_agents_namespace_test.yaml still asserted the old behavior (namespaced Role + RoleBinding in the release namespace when clusterScope=false). Update that case to assert the corrected contract -- a ClusterRole + ClusterRoleBinding always render (with no metadata.namespace), while the subject SA still lives in the node-agents namespace. The clusterScope=false path stays under test; only the asserted behavior changes to match the fix. Verified with `helm unittest` (all resource-monitor suites pass). 
Co-Authored-By: Claude Opus 4.8 --- client/tests/node_agents_namespace_test.yaml | 22 +++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/client/tests/node_agents_namespace_test.yaml b/client/tests/node_agents_namespace_test.yaml index 67d4214..f597a24 100644 --- a/client/tests/node_agents_namespace_test.yaml +++ b/client/tests/node_agents_namespace_test.yaml @@ -154,28 +154,30 @@ tests: path: subjects[0].namespace value: tracebloc-node-agents - - it: should keep namespace-scoped Role + RoleBinding in the release namespace when clusterScope is false, with SA subject in node-agents + - it: should still render a cluster-scoped ClusterRole + ClusterRoleBinding when clusterScope is false, with SA subject in node-agents template: templates/resource-monitor-rbac.yaml set: clusterScope: false asserts: - # Role must grant access where the monitored workloads live (release ns) + # Per-node monitoring relies on list_pod_for_all_namespaces (a cluster-scoped + # verb a namespaced Role can never satisfy) and reads its own pod in the + # node-agents namespace, so resource-monitor RBAC is intentionally decoupled + # from clusterScope -- it is ALWAYS cluster-scoped. clusterScope still gates + # the training/jobs isolation footprint elsewhere. - isKind: - of: Role + of: ClusterRole documentIndex: 1 - - equal: + # Cluster-scoped resources carry no metadata.namespace + - isNull: path: metadata.namespace - value: tracebloc-templates documentIndex: 1 - # RoleBinding sits in the release ns so it applies the Role there - isKind: - of: RoleBinding + of: ClusterRoleBinding documentIndex: 2 - - equal: + - isNull: path: metadata.namespace - value: tracebloc-templates documentIndex: 2 - # ...but the subject SA lives in the node-agents namespace + # ...but the subject SA still lives in the node-agents namespace - equal: path: subjects[0].namespace value: tracebloc-node-agents