diff --git a/cmd/kubectl-datadog/autoscaling/cluster/cluster.go b/cmd/kubectl-datadog/autoscaling/cluster/cluster.go index b66e041d97..de74bc0b06 100644 --- a/cmd/kubectl-datadog/autoscaling/cluster/cluster.go +++ b/cmd/kubectl-datadog/autoscaling/cluster/cluster.go @@ -6,6 +6,7 @@ import ( "github.com/spf13/cobra" "k8s.io/cli-runtime/pkg/genericclioptions" + "github.com/DataDog/datadog-operator/cmd/kubectl-datadog/autoscaling/cluster/evict" "github.com/DataDog/datadog-operator/cmd/kubectl-datadog/autoscaling/cluster/install" "github.com/DataDog/datadog-operator/cmd/kubectl-datadog/autoscaling/cluster/uninstall" "github.com/DataDog/datadog-operator/cmd/kubectl-datadog/autoscaling/cluster/update" @@ -35,6 +36,7 @@ func New(streams genericclioptions.IOStreams) *cobra.Command { cmd.AddCommand(install.New(streams)) cmd.AddCommand(uninstall.New(streams)) cmd.AddCommand(update.New(streams)) + cmd.AddCommand(evict.New(streams)) o := newOptions(streams) o.configFlags.AddFlags(cmd.Flags()) diff --git a/cmd/kubectl-datadog/autoscaling/cluster/common/aws/node.go b/cmd/kubectl-datadog/autoscaling/cluster/common/aws/node.go new file mode 100644 index 0000000000..050e07905d --- /dev/null +++ b/cmd/kubectl-datadog/autoscaling/cluster/common/aws/node.go @@ -0,0 +1,36 @@ +package aws + +import ( + "regexp" + + corev1 "k8s.io/api/core/v1" +) + +// awsProviderIDRegexp matches the AWS provider ID for EC2-backed nodes. +// Format: aws:////i- (e.g. aws:///us-east-1a/i-0abc123def456789). +// Fargate nodes use a different shape (aws:////fargate-ip-...) and must +// therefore be classified by label before reaching this regex. +var awsProviderIDRegexp = regexp.MustCompile(`^aws:///[^/]+/(i-[0-9a-f]+)$`) + +// LabelEKSNodegroup is the label EKS stamps on every node that belongs to a +// managed node group. The label value is the node group name. Exposed as a +// constant so every consumer (classifier, evict-legacy-nodes, future code) +// references the same string. +const LabelEKSNodegroup = "eks.amazonaws.com/nodegroup" + +// ExtractEC2InstanceID returns the EC2 instance ID (i-...) from a Node's +// providerID, or false when the providerID is not an EC2 instance (Fargate +// uses `aws:////fargate-ip-...`, GCP/Azure use entirely different shapes, +// etc.). Lives here in `common/aws` so both `common/clusterinfo` (which +// imports `common/karpenter`) and `common/karpenter` (which classifies its +// own nodes) can use it without creating an import cycle. +func ExtractEC2InstanceID(node *corev1.Node) (string, bool) { + if node == nil { + return "", false + } + m := awsProviderIDRegexp.FindStringSubmatch(node.Spec.ProviderID) + if len(m) != 2 { + return "", false + } + return m[1], true +} diff --git a/cmd/kubectl-datadog/autoscaling/cluster/common/aws/node_test.go b/cmd/kubectl-datadog/autoscaling/cluster/common/aws/node_test.go new file mode 100644 index 0000000000..c13e9a0ad0 --- /dev/null +++ b/cmd/kubectl-datadog/autoscaling/cluster/common/aws/node_test.go @@ -0,0 +1,38 @@ +package aws + +import ( + "testing" + + "github.com/stretchr/testify/assert" + corev1 "k8s.io/api/core/v1" +) + +func TestExtractEC2InstanceID(t *testing.T) { + for _, tc := range []struct { + name string + provider string + wantID string + wantOK bool + }{ + {name: "ec2", provider: "aws:///eu-west-3a/i-0123456789abcdef0", wantID: "i-0123456789abcdef0", wantOK: true}, + {name: "ec2 short id", provider: "aws:///us-east-1b/i-abc123", wantID: "i-abc123", wantOK: true}, + {name: "fargate", provider: "aws:///eu-west-3a/fargate-ip-10-0-1-2", wantOK: false}, + {name: "gcp", provider: "gce://project/zone/instance", wantOK: false}, + {name: "empty", provider: "", wantOK: false}, + {name: "missing prefix", provider: "i-0123456789abcdef0", wantOK: false}, + {name: "missing AZ", provider: "aws:////i-0123456789abcdef0", wantOK: false}, + } { + t.Run(tc.name, func(t *testing.T) { + node := &corev1.Node{Spec: corev1.NodeSpec{ProviderID: tc.provider}} + id, ok := ExtractEC2InstanceID(node) + assert.Equal(t, tc.wantOK, ok) + if tc.wantOK { + assert.Equal(t, tc.wantID, id) + } + }) + } + t.Run("nil node", func(t *testing.T) { + _, ok := ExtractEC2InstanceID(nil) + assert.False(t, ok) + }) +} diff --git a/cmd/kubectl-datadog/autoscaling/cluster/common/clusterinfo/classify.go b/cmd/kubectl-datadog/autoscaling/cluster/common/clusterinfo/classify.go index 0403b5ae63..79d1528d7a 100644 --- a/cmd/kubectl-datadog/autoscaling/cluster/common/clusterinfo/classify.go +++ b/cmd/kubectl-datadog/autoscaling/cluster/common/clusterinfo/classify.go @@ -5,7 +5,6 @@ import ( "fmt" "log" "maps" - "regexp" "strings" "time" @@ -36,11 +35,6 @@ import ( // ValidationError at the API. const describeASGInstancesMaxIDs = 50 -// awsProviderIDRegexp matches the AWS provider ID for EC2-backed nodes. -// Format: aws:////i-. Fargate nodes use a different shape and -// must therefore be classified by label before reaching this regex. -var awsProviderIDRegexp = regexp.MustCompile(`^aws:///[^/]+/(i-[0-9a-f]+)$`) - // nodePoolDatadogCreatedLabel is the label set by every Datadog autoscaling // product (kubectl-datadog AND the cluster agent) on the NodePools they // manage. Broader than the AND-pair `app.kubernetes.io/managed-by: @@ -173,10 +167,9 @@ func classifyByLabels(ctx context.Context, k8sClient kubernetes.Interface, farga return nil } - matches := awsProviderIDRegexp.FindStringSubmatch(node.Spec.ProviderID) - if len(matches) == 2 { + if id, ok := commonaws.ExtractEC2InstanceID(node); ok { candidates = append(candidates, asgCandidate{ - instanceID: matches[1], + instanceID: id, nodeName: node.Name, }) } else { diff --git a/cmd/kubectl-datadog/autoscaling/cluster/common/karpenter/fromnodes.go b/cmd/kubectl-datadog/autoscaling/cluster/common/karpenter/fromnodes.go index 504f37850b..d3783fd70e 100644 --- a/cmd/kubectl-datadog/autoscaling/cluster/common/karpenter/fromnodes.go +++ b/cmd/kubectl-datadog/autoscaling/cluster/common/karpenter/fromnodes.go @@ -5,7 +5,6 @@ import ( "fmt" "log" "maps" - "regexp" "slices" "strings" @@ -17,11 +16,9 @@ import ( "k8s.io/apimachinery/pkg/runtime" "k8s.io/client-go/kubernetes" "k8s.io/client-go/tools/pager" -) -// awsProviderIDRegexp matches the AWS provider ID format for EC2 instances. -// Format: aws:///ZONE/INSTANCE_ID (e.g., aws:///us-east-1a/i-0abc123def456789) -var awsProviderIDRegexp = regexp.MustCompile(`^aws:///[^/]+/(i-[0-9a-f]+)$`) + commonaws "github.com/DataDog/datadog-operator/cmd/kubectl-datadog/autoscaling/cluster/common/aws" +) // ec2DescribeBatchSize bounds the number of instance IDs we hand to a single // ec2:DescribeInstances / ec2:DescribeImages call. The K8s pager streams @@ -62,12 +59,12 @@ func GetNodesProperties(ctx context.Context, clientset *kubernetes.Clientset, ec if _, isKarpenter := node.Labels["karpenter.k8s.aws/ec2nodeclass"]; isKarpenter { return nil } - matches := awsProviderIDRegexp.FindStringSubmatch(node.Spec.ProviderID) - if len(matches) != 2 { + id, ok := commonaws.ExtractEC2InstanceID(node) + if !ok { log.Printf("Skipping node %s with unexpected provider ID: %s", node.Name, node.Spec.ProviderID) return nil } - pending[matches[1]] = pendingNode{labels: node.Labels, taints: node.Spec.Taints} + pending[id] = pendingNode{labels: node.Labels, taints: node.Spec.Taints} if len(pending) >= ec2DescribeBatchSize { return flush() } diff --git a/cmd/kubectl-datadog/autoscaling/cluster/evict/asg.go b/cmd/kubectl-datadog/autoscaling/cluster/evict/asg.go new file mode 100644 index 0000000000..cdef49388e --- /dev/null +++ b/cmd/kubectl-datadog/autoscaling/cluster/evict/asg.go @@ -0,0 +1,169 @@ +package evict + +import ( + "context" + "errors" + "fmt" + "log" + + awssdk "github.com/aws/aws-sdk-go-v2/aws" + "github.com/aws/aws-sdk-go-v2/service/autoscaling" + "k8s.io/client-go/kubernetes" + + commonaws "github.com/DataDog/datadog-operator/cmd/kubectl-datadog/autoscaling/cluster/common/aws" +) + +// AutoscalingAPI is the subset of *autoscaling.Client used by evictASG. +// Defined as an interface so unit tests can stub the AWS SDK out cheaply. +type AutoscalingAPI interface { + UpdateAutoScalingGroup(ctx context.Context, in *autoscaling.UpdateAutoScalingGroupInput, opts ...func(*autoscaling.Options)) (*autoscaling.UpdateAutoScalingGroupOutput, error) + SuspendProcesses(ctx context.Context, in *autoscaling.SuspendProcessesInput, opts ...func(*autoscaling.Options)) (*autoscaling.SuspendProcessesOutput, error) + TerminateInstanceInAutoScalingGroup(ctx context.Context, in *autoscaling.TerminateInstanceInAutoScalingGroupInput, opts ...func(*autoscaling.Options)) (*autoscaling.TerminateInstanceInAutoScalingGroupOutput, error) +} + +// evictASG cordons every node in the ASG up front, then drains them, +// terminating each node's instance — and decrementing the ASG's desired +// capacity — as soon as that node has drained cleanly, so its capacity is +// freed without waiting for the rest of the group. Cordoning the whole group +// before any drain (as EKS does for a managed node group) keeps a pod evicted +// from one node from landing on a sibling node that is itself about to be +// drained. Once every node has drained, the ASG is locked at min=max=desired=0 +// so nothing is ever relaunched. +// +// Safety rules: +// +// 1. An instance is only terminated once its node has drained cleanly. If a +// drain fails, that instance is left running (its pods are still on it) +// and the ASG is left at MinSize=0 with AZRebalance suspended but is NOT +// locked at MaxSize=0, so a re-run can pick up where this one stopped. +// 2. Per-instance termination requires two precautions, applied up front by +// prepareASGForTermination: (a) MinSize=0, because AWS rejects +// TerminateInstanceInAutoScalingGroup with ShouldDecrementDesiredCapacity +// while MinSize == DesiredCapacity; and (b) suspending the AZRebalance +// process, because decrementing desired capacity one instance at a time +// can otherwise trigger AZ rebalancing that terminates a not-yet-drained +// instance in another AZ. +// 3. Crash window: because instances are terminated before the final +// MaxSize=0 lock, a crash (or lost AWS connectivity) after the last +// termination but before the lock leaves the ASG at DesiredCapacity=0 +// (so it will not relaunch on its own) but with its original MaxSize. +// A re-run cannot rediscover it — target discovery is driven by surviving +// Kubernetes nodes, which are now gone — so the operator must re-lock +// MaxSize manually if an external scaler might raise the desired capacity. +// In practice the command also scales cluster-autoscaler to zero, so +// nothing routinely raises it. +// +// We never delete the ASG: it may be managed by Terraform/CloudFormation/Helm, +// and only the original owner should remove it. +func evictASG(ctx context.Context, clientset kubernetes.Interface, asg AutoscalingAPI, asgName string, nodes []string, drainOpts nodeDrainOptions) error { + if drainOpts.DryRun { + log.Printf("[dry-run] would suspend AZRebalance and set MinSize=0 on ASG %s", asgName) + } else if err := prepareASGForTermination(ctx, asg, asgName); err != nil { + return fmt.Errorf("prepare ASG %s for termination: %w", asgName, err) + } + + // Cordon every node up front so a pod evicted from one node is never + // rescheduled onto another node of the same ASG that is itself about to be + // drained. A node that fails to cordon is left undrained; treat that as a + // drain failure so the ASG keeps its original MaxSize for a re-run instead + // of being locked away with workloads still on it. + cordoned, errs := cordonNodes(ctx, clientset, nodes, drainOpts.DryRun) + + for _, node := range cordoned { + nodeName := node.Name + id, hasInstanceID := commonaws.ExtractEC2InstanceID(node) + if !hasInstanceID { + log.Printf("Warning: node %s has unexpected providerID %q; its instance will be terminated by the final scale-to-zero instead", nodeName, node.Spec.ProviderID) + } + if err := drainNode(ctx, clientset, nodeName, drainOpts); err != nil { + errs = append(errs, fmt.Errorf("drain node %s: %w", nodeName, err)) + continue // do NOT terminate this instance: workloads are still on it + } + // The node drained cleanly: terminate its instance now, decrementing + // the ASG's desired capacity so it is not relaunched. Nodes with an + // unexpected providerID are left for the final scale-to-zero. + if !hasInstanceID { + continue + } + if drainOpts.DryRun { + log.Printf("[dry-run] would terminate instance %s in ASG %s (decrementing desired capacity)", id, asgName) + continue + } + if err := terminateASGInstance(ctx, asg, id); err != nil { + errs = append(errs, fmt.Errorf("terminate instance %s in ASG %s: %w", id, asgName, err)) + } + } + + if len(errs) > 0 { + log.Printf("ASG %s: at least one node failed to cordon, drain, or terminate; leaving the ASG at MinSize=0 without locking MaxSize. Re-run after addressing the errors above.", asgName) + return errors.Join(errs...) + } + + // Every node drained and its instance was terminated. Lock the ASG at + // min=max=desired=0 so nothing is ever relaunched, and to clean up any + // instance that couldn't be terminated per-node (unexpected providerID). + if drainOpts.DryRun { + log.Printf("[dry-run] would scale ASG %s to min=max=desired=0", asgName) + return nil + } + // All instances are now terminated; only the MaxSize=0 lock remains. A + // crash here would leave the ASG at desired=0 but unlocked and no longer + // rediscoverable by a re-run (see the crash-window note on evictASG). + log.Printf("ASG %s: all nodes drained; locking the ASG at min=max=desired=0.", asgName) + if err := scaleASGToZero(ctx, asg, asgName); err != nil { + errs = append(errs, fmt.Errorf("scale ASG %s to 0: %w", asgName, err)) + } + return errors.Join(errs...) +} + +// prepareASGForTermination makes the ASG safe for the per-instance termination +// performed during the drain loop: +// +// - AZRebalance is suspended so that decrementing desired capacity one +// instance at a time cannot trigger AZ rebalancing — which would terminate +// a not-yet-drained instance in another availability zone. +// - MinSize is set to 0 so that TerminateInstanceInAutoScalingGroup may +// decrement DesiredCapacity (AWS rejects the decrement while +// MinSize == DesiredCapacity). +func prepareASGForTermination(ctx context.Context, asg AutoscalingAPI, asgName string) error { + if _, err := asg.SuspendProcesses(ctx, &autoscaling.SuspendProcessesInput{ + AutoScalingGroupName: awssdk.String(asgName), + ScalingProcesses: []string{"AZRebalance"}, + }); err != nil { + return fmt.Errorf("suspend AZRebalance: %w", err) + } + if _, err := asg.UpdateAutoScalingGroup(ctx, &autoscaling.UpdateAutoScalingGroupInput{ + AutoScalingGroupName: awssdk.String(asgName), + MinSize: awssdk.Int32(0), + }); err != nil { + return fmt.Errorf("set MinSize=0: %w", err) + } + log.Printf("Prepared ASG %s for termination (AZRebalance suspended, MinSize=0).", asgName) + return nil +} + +// terminateASGInstance terminates a single (already drained) instance and +// decrements the ASG's desired capacity so it is not relaunched. +func terminateASGInstance(ctx context.Context, asg AutoscalingAPI, instanceID string) error { + if _, err := asg.TerminateInstanceInAutoScalingGroup(ctx, &autoscaling.TerminateInstanceInAutoScalingGroupInput{ + InstanceId: awssdk.String(instanceID), + ShouldDecrementDesiredCapacity: awssdk.Bool(true), + }); err != nil { + return err + } + log.Printf("Terminated instance %s and decremented ASG desired capacity.", instanceID) + return nil +} + +func scaleASGToZero(ctx context.Context, asg AutoscalingAPI, asgName string) error { + if _, err := asg.UpdateAutoScalingGroup(ctx, &autoscaling.UpdateAutoScalingGroupInput{ + AutoScalingGroupName: awssdk.String(asgName), + MinSize: awssdk.Int32(0), + MaxSize: awssdk.Int32(0), + DesiredCapacity: awssdk.Int32(0), + }); err != nil { + return err + } + log.Printf("Scaled ASG %s to min=max=desired=0.", asgName) + return nil +} diff --git a/cmd/kubectl-datadog/autoscaling/cluster/evict/asg_test.go b/cmd/kubectl-datadog/autoscaling/cluster/evict/asg_test.go new file mode 100644 index 0000000000..e061f1b1db --- /dev/null +++ b/cmd/kubectl-datadog/autoscaling/cluster/evict/asg_test.go @@ -0,0 +1,353 @@ +package evict + +import ( + "context" + "errors" + "slices" + "testing" + "time" + + awssdk "github.com/aws/aws-sdk-go-v2/aws" + "github.com/aws/aws-sdk-go-v2/service/autoscaling" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/kubernetes/fake" + clienttesting "k8s.io/client-go/testing" +) + +// stubAutoscaling implements AutoscalingAPI by recording each call. Returning +// a non-nil output keeps the production code happy without exercising any AWS +// SDK middleware. +type stubAutoscaling struct { + updates []autoscaling.UpdateAutoScalingGroupInput // captured UpdateAutoScalingGroup inputs, in order + suspendedAZRebalance []string // asg names for which AZRebalance was suspended + terminated []string // instance IDs terminated in-ASG + suspendErr error // returned by SuspendProcesses + prepUpdateErr error // returned by the MinSize-only prep update + lockErr error // returned by the final min=max=desired=0 lock update + terminateErr error // returned by TerminateInstanceInAutoScalingGroup +} + +func (s *stubAutoscaling) UpdateAutoScalingGroup(_ context.Context, in *autoscaling.UpdateAutoScalingGroupInput, _ ...func(*autoscaling.Options)) (*autoscaling.UpdateAutoScalingGroupOutput, error) { + s.updates = append(s.updates, *in) + // The prep update sets MinSize only (MaxSize left nil); the final lock + // update sets all three fields. Fail the one the test asked for. + if in.MaxSize == nil { + return &autoscaling.UpdateAutoScalingGroupOutput{}, s.prepUpdateErr + } + return &autoscaling.UpdateAutoScalingGroupOutput{}, s.lockErr +} + +func (s *stubAutoscaling) SuspendProcesses(_ context.Context, in *autoscaling.SuspendProcessesInput, _ ...func(*autoscaling.Options)) (*autoscaling.SuspendProcessesOutput, error) { + if slices.Contains(in.ScalingProcesses, "AZRebalance") { + s.suspendedAZRebalance = append(s.suspendedAZRebalance, awssdk.ToString(in.AutoScalingGroupName)) + } + return &autoscaling.SuspendProcessesOutput{}, s.suspendErr +} + +func (s *stubAutoscaling) TerminateInstanceInAutoScalingGroup(_ context.Context, in *autoscaling.TerminateInstanceInAutoScalingGroupInput, _ ...func(*autoscaling.Options)) (*autoscaling.TerminateInstanceInAutoScalingGroupOutput, error) { + s.terminated = append(s.terminated, awssdk.ToString(in.InstanceId)) + return &autoscaling.TerminateInstanceInAutoScalingGroupOutput{}, s.terminateErr +} + +// minSizeZeroPrepped reports whether a "prep" update (MinSize=0 only, MaxSize +// left untouched) was issued for asgName. +func (s *stubAutoscaling) minSizeZeroPrepped(asgName string) bool { + return slices.ContainsFunc(s.updates, func(u autoscaling.UpdateAutoScalingGroupInput) bool { + return awssdk.ToString(u.AutoScalingGroupName) == asgName && + u.MinSize != nil && *u.MinSize == 0 && u.MaxSize == nil + }) +} + +// locked reports whether the final min=max=desired=0 update was issued for +// asgName. +func (s *stubAutoscaling) locked(asgName string) bool { + return slices.ContainsFunc(s.updates, func(u autoscaling.UpdateAutoScalingGroupInput) bool { + return awssdk.ToString(u.AutoScalingGroupName) == asgName && + u.MinSize != nil && *u.MinSize == 0 && + u.MaxSize != nil && *u.MaxSize == 0 && + u.DesiredCapacity != nil && *u.DesiredCapacity == 0 + }) +} + +func newDrainOpts(dryRun bool) nodeDrainOptions { + return nodeDrainOptions{ + DryRun: dryRun, + EvictionTimeout: time.Second, + NodeTimeout: time.Second, + PollInterval: 10 * time.Millisecond, + } +} + +func TestEvictASG(t *testing.T) { + ec2Node := func() *corev1.Node { + return &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{Name: "ip-1"}, + Spec: corev1.NodeSpec{ProviderID: "aws:///eu-west-3a/i-0123456789abcdef0"}, + } + } + fargateNode := func() *corev1.Node { + return &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{Name: "ip-1"}, + Spec: corev1.NodeSpec{ProviderID: "aws:///eu-west-3a/fargate-ip-10-0-1-2"}, + } + } + stuckNode := func() *corev1.Node { + return &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{Name: "ip-stuck"}, + Spec: corev1.NodeSpec{ProviderID: "aws:///eu-west-3a/i-aaaaaaaaaaaaaaaaa"}, + } + } + stuckPod := func() *corev1.Pod { + return &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "blocker", Namespace: "default"}, + Spec: corev1.PodSpec{NodeName: "ip-stuck"}, + } + } + fastDrain := nodeDrainOptions{ + EvictionTimeout: 50 * time.Millisecond, + NodeTimeout: 100 * time.Millisecond, + PollInterval: 10 * time.Millisecond, + } + + for _, tc := range []struct { + name string + // objects pre-populate the fake clientset. + objects []runtime.Object + // installEvictionReactor wires a 200-OK Eviction subresource reactor + // that keeps the pod alive in the store (so drainNode hits its node + // timeout). Used only by the drain-failure case. + installEvictionReactor bool + // nodes is the slice passed to evictASG. + nodes []string + opts nodeDrainOptions + // terminateErr / lockErr inject AWS failures into the per-instance + // termination and the final lock update respectively. + terminateErr error + lockErr error + // wantErr=true requires evictASG to return a non-nil error. + wantErr bool + // wantPrepped: the ASG was prepped for termination (AZRebalance + // suspended + MinSize=0). + wantPrepped bool + // wantTerminated is the expected set of instance IDs terminated via + // TerminateInstanceInAutoScalingGroup. nil ⇒ none. + wantTerminated []string + // wantLocked: the final min=max=desired=0 update was issued. + wantLocked bool + // wantUnschedulable: per-node expected `Spec.Unschedulable`. Nodes + // absent from the map (or absent from the cluster) aren't checked. + wantUnschedulable map[string]bool + }{ + { + name: "happy path", + objects: []runtime.Object{ec2Node()}, + nodes: []string{"ip-1"}, + opts: newDrainOpts(false), + wantPrepped: true, + wantTerminated: []string{"i-0123456789abcdef0"}, + wantLocked: true, + wantUnschedulable: map[string]bool{"ip-1": true}, + }, + { + // ASG neutralization still happens even when the requested K8s + // node is already gone — the operator may be re-running after a + // partial earlier execution and the AWS-side scale-to-zero must + // still land. No per-instance termination: the node (hence its + // instance ID) is gone. + name: "node already gone", + nodes: []string{"ip-1"}, + opts: newDrainOpts(false), + wantPrepped: true, + wantLocked: true, + }, + { + name: "dry-run mutates nothing", + objects: []runtime.Object{ec2Node()}, + nodes: []string{"ip-1"}, + opts: newDrainOpts(true), + wantUnschedulable: map[string]bool{"ip-1": false}, + // wantPrepped/wantLocked false, wantTerminated nil ⇒ no AWS + // mutation must happen. + }, + { + // A node with a non-EC2 providerID (e.g. Fargate-ish) yields no + // instance ID: it still drains, but per-instance termination is + // skipped and the instance is cleaned up by the final + // scale-to-zero instead. + name: "non-EC2 providerID drains, skips per-instance terminate, still locks", + objects: []runtime.Object{fargateNode()}, + nodes: []string{"ip-1"}, + opts: newDrainOpts(false), + wantPrepped: true, + wantLocked: true, + wantUnschedulable: map[string]bool{"ip-1": true}, + }, + { + // Safety regression: a node failing to drain (here a PDB-blocked + // pod stays in the fake store past the eviction reactor) must + // leave the instance running and the ASG unlocked (MaxSize not + // forced to 0) so a re-run can pick up where this one stopped. + // The up-front prep (MinSize=0, AZRebalance suspended) has + // happened and is idempotent on re-run. + name: "drain failure leaves ASG unlocked", + objects: []runtime.Object{stuckNode(), stuckPod()}, + installEvictionReactor: true, + nodes: []string{"ip-stuck"}, + opts: fastDrain, + wantErr: true, + wantPrepped: true, + wantLocked: false, + }, + { + // A per-instance termination failure must propagate and, like a + // drain failure, leave the ASG unlocked for a re-run. + name: "terminate failure leaves ASG unlocked", + objects: []runtime.Object{ec2Node()}, + nodes: []string{"ip-1"}, + opts: newDrainOpts(false), + terminateErr: errTerminate, + wantErr: true, + wantPrepped: true, + wantTerminated: []string{"i-0123456789abcdef0"}, + wantLocked: false, + }, + { + // A failure of the final lock update must surface as an error + // (the lock call was still attempted). + name: "final lock failure surfaces error", + objects: []runtime.Object{ec2Node()}, + nodes: []string{"ip-1"}, + opts: newDrainOpts(false), + lockErr: errLock, + wantErr: true, + wantPrepped: true, + wantTerminated: []string{"i-0123456789abcdef0"}, + wantLocked: true, + }, + } { + t.Run(tc.name, func(t *testing.T) { + client := fake.NewClientset(tc.objects...) + if tc.installEvictionReactor { + installPodEvictionReactor(client) + } + stub := &stubAutoscaling{terminateErr: tc.terminateErr, lockErr: tc.lockErr} + + err := evictASG(t.Context(), client, stub, "my-asg", tc.nodes, tc.opts) + if tc.wantErr { + require.Error(t, err) + } else { + require.NoError(t, err) + } + + if tc.wantPrepped { + assert.Equal(t, []string{"my-asg"}, stub.suspendedAZRebalance, "AZRebalance must be suspended once") + assert.True(t, stub.minSizeZeroPrepped("my-asg"), "MinSize=0 prep update must be issued") + } else { + assert.Empty(t, stub.suspendedAZRebalance, "AZRebalance must not be suspended") + assert.Empty(t, stub.updates, "no UpdateAutoScalingGroup call must happen") + } + + if tc.wantTerminated == nil { + assert.Empty(t, stub.terminated, "no instance must be terminated") + } else { + assert.ElementsMatch(t, tc.wantTerminated, stub.terminated) + } + + assert.Equal(t, tc.wantLocked, stub.locked("my-asg"), "final scale-to-zero") + + for nodeName, want := range tc.wantUnschedulable { + got, getErr := client.CoreV1().Nodes().Get(t.Context(), nodeName, metav1.GetOptions{}) + require.NoError(t, getErr) + assert.Equal(t, want, got.Spec.Unschedulable, "Spec.Unschedulable for %s", nodeName) + } + }) + } +} + +var ( + errSuspend = errors.New("suspend boom") + errPrepMin = errors.New("min-size boom") + errTerminate = errors.New("terminate boom") + errLock = errors.New("lock boom") +) + +// TestEvictASGCordonFailure verifies that when a node cannot be cordoned, it is +// left undrained and the ASG is NOT locked at MaxSize=0 — a re-run must be able +// to finish the job. The up-front prep (AZRebalance suspend + MinSize=0) still +// ran and is idempotent on the re-run. +func TestEvictASGCordonFailure(t *testing.T) { + node := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{Name: "ip-1"}, + Spec: corev1.NodeSpec{ProviderID: "aws:///eu-west-3a/i-0123456789abcdef0"}, + } + client := fake.NewClientset(node) + // Every node Update fails with a non-conflict error so the cordon gives up. + client.PrependReactor("update", "nodes", func(clienttesting.Action) (bool, runtime.Object, error) { + return true, nil, apierrors.NewInternalError(errors.New("cordon boom")) + }) + stub := &stubAutoscaling{} + + err := evictASG(t.Context(), client, stub, "my-asg", []string{"ip-1"}, newDrainOpts(false)) + require.Error(t, err) + + // Prep precedes the cordon, so it ran... + assert.Equal(t, []string{"my-asg"}, stub.suspendedAZRebalance, "AZRebalance suspended up front") + assert.True(t, stub.minSizeZeroPrepped("my-asg"), "MinSize=0 prep update issued up front") + // ...but the un-cordoned node was never drained or terminated, and the ASG + // stays unlocked so a re-run can finish. + assert.Empty(t, stub.terminated, "no instance terminated when the node failed to cordon") + assert.False(t, stub.locked("my-asg"), "ASG must not be locked when a node failed to cordon") +} + +// TestEvictASGPrepFailure covers the up-front prepareASGForTermination failure +// branches: when either the AZRebalance suspend or the MinSize=0 update fails, +// evictASG must abort before touching any node (no cordon, no drain, no +// termination, no lock) so a re-run can start cleanly. +func TestEvictASGPrepFailure(t *testing.T) { + ec2Node := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{Name: "ip-1"}, + Spec: corev1.NodeSpec{ProviderID: "aws:///eu-west-3a/i-0123456789abcdef0"}, + } + + for _, tc := range []struct { + name string + suspendErr error + prepUpdateErr error + // wantMinSizeUpdate: the MinSize=0 prep update was attempted (only + // reached when the suspend succeeded). + wantMinSizeUpdate bool + }{ + { + name: "suspend failure aborts before draining", + suspendErr: errSuspend, + }, + { + name: "min-size update failure aborts before draining", + prepUpdateErr: errPrepMin, + wantMinSizeUpdate: true, + }, + } { + t.Run(tc.name, func(t *testing.T) { + client := fake.NewClientset(ec2Node) + stub := &stubAutoscaling{suspendErr: tc.suspendErr, prepUpdateErr: tc.prepUpdateErr} + + err := evictASG(t.Context(), client, stub, "my-asg", []string{"ip-1"}, newDrainOpts(false)) + require.Error(t, err) + + // The drain loop must never have run: the node stays schedulable + // and no instance is terminated or locked. + assert.Empty(t, stub.terminated, "no instance must be terminated") + assert.False(t, stub.locked("my-asg"), "ASG must not be locked") + assert.Equal(t, tc.wantMinSizeUpdate, stub.minSizeZeroPrepped("my-asg"), "MinSize=0 prep update attempted") + + got, getErr := client.CoreV1().Nodes().Get(t.Context(), "ip-1", metav1.GetOptions{}) + require.NoError(t, getErr) + assert.False(t, got.Spec.Unschedulable, "node must not be cordoned when prep fails") + }) + } +} diff --git a/cmd/kubectl-datadog/autoscaling/cluster/evict/clusterautoscaler.go b/cmd/kubectl-datadog/autoscaling/cluster/evict/clusterautoscaler.go new file mode 100644 index 0000000000..0188f70510 --- /dev/null +++ b/cmd/kubectl-datadog/autoscaling/cluster/evict/clusterautoscaler.go @@ -0,0 +1,50 @@ +package evict + +import ( + "context" + "fmt" + "log" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/util/retry" + + "github.com/DataDog/datadog-operator/cmd/kubectl-datadog/autoscaling/cluster/common/clusterinfo" +) + +// scaleDownClusterAutoscaler patches the cluster-autoscaler Deployment's +// /scale subresource to 0 replicas. Using the subresource keeps the change +// minimal — we never touch the rest of the Deployment spec, so the diff is +// invisible to a Helm/Argo controller that watches the image, env, etc. +// +// Wrapped in RetryOnConflict to survive races against any controller that +// updates the Deployment between our Get and our Update. +// +// Idempotent: a Deployment already at 0 replicas returns nil without an +// Update call. Best-effort against GitOps controllers — if Helm/Argo reverts +// the change, this command does not loop; the operator is expected to pause +// the GitOps reconciliation for cluster-autoscaler before running. +func scaleDownClusterAutoscaler(ctx context.Context, clientset kubernetes.Interface, ca clusterinfo.ClusterAutoscaler, dryRun bool) error { + if !ca.Present { + return nil + } + if dryRun { + log.Printf("[dry-run] would scale Deployment %s/%s to 0 replicas", ca.Namespace, ca.Name) + return nil + } + return retry.RetryOnConflict(retry.DefaultRetry, func() error { + scale, err := clientset.AppsV1().Deployments(ca.Namespace).GetScale(ctx, ca.Name, metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("failed to get Deployment scale %s/%s: %w", ca.Namespace, ca.Name, err) + } + if scale.Spec.Replicas == 0 { + return nil + } + scale.Spec.Replicas = 0 + if _, err = clientset.AppsV1().Deployments(ca.Namespace).UpdateScale(ctx, ca.Name, scale, metav1.UpdateOptions{}); err != nil { + return err + } + log.Printf("Scaled Deployment %s/%s to 0 replicas.", ca.Namespace, ca.Name) + return nil + }) +} diff --git a/cmd/kubectl-datadog/autoscaling/cluster/evict/clusterautoscaler_test.go b/cmd/kubectl-datadog/autoscaling/cluster/evict/clusterautoscaler_test.go new file mode 100644 index 0000000000..6e82f4f0cf --- /dev/null +++ b/cmd/kubectl-datadog/autoscaling/cluster/evict/clusterautoscaler_test.go @@ -0,0 +1,165 @@ +package evict + +import ( + "errors" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + appsv1 "k8s.io/api/apps/v1" + autoscalingv1 "k8s.io/api/autoscaling/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/client-go/kubernetes/fake" + clienttesting "k8s.io/client-go/testing" + "k8s.io/utils/ptr" + + "github.com/DataDog/datadog-operator/cmd/kubectl-datadog/autoscaling/cluster/common/clusterinfo" +) + +// deploymentsGVR is the GroupVersionResource used to talk to the fake +// clientset's object tracker for Deployment objects. +var deploymentsGVR = schema.GroupVersionResource{Group: "apps", Version: "v1", Resource: "deployments"} + +// scaleReactor wires GetScale/UpdateScale on a fake clientset so the test +// observes the same read-modify-write surface as production. The fake doesn't +// natively support the Scale subresource, so we synthesize Scale objects from +// the underlying Deployment. The reactor talks to the underlying tracker +// directly (rather than via client.AppsV1()) because the fake holds an +// internal mutex while a reactor runs — calling back through the typed client +// would deadlock. +func scaleReactor(t *testing.T, client *fake.Clientset, conflictFirstUpdate bool) *int { + t.Helper() + updateCalls := 0 + tracker := client.Tracker() + client.PrependReactor("get", "deployments", func(action clienttesting.Action) (bool, runtime.Object, error) { + ga, ok := action.(clienttesting.GetAction) + if !ok || ga.GetSubresource() != "scale" { + return false, nil, nil + } + obj, err := tracker.Get(deploymentsGVR, ga.GetNamespace(), ga.GetName()) + if err != nil { + return true, nil, err + } + dep := obj.(*appsv1.Deployment) + var replicas int32 + if dep.Spec.Replicas != nil { + replicas = *dep.Spec.Replicas + } + return true, &autoscalingv1.Scale{ + ObjectMeta: metav1.ObjectMeta{Name: dep.Name, Namespace: dep.Namespace, ResourceVersion: dep.ResourceVersion}, + Spec: autoscalingv1.ScaleSpec{Replicas: replicas}, + }, nil + }) + client.PrependReactor("update", "deployments", func(action clienttesting.Action) (bool, runtime.Object, error) { + ua, ok := action.(clienttesting.UpdateAction) + if !ok || ua.GetSubresource() != "scale" { + return false, nil, nil + } + scale := ua.GetObject().(*autoscalingv1.Scale) + updateCalls++ + if conflictFirstUpdate && updateCalls == 1 { + return true, nil, apierrors.NewConflict( + schema.GroupResource{Group: "apps", Resource: "deployments"}, + scale.Name, + errors.New("forced conflict"), + ) + } + obj, err := tracker.Get(deploymentsGVR, ua.GetNamespace(), scale.Name) + if err != nil { + return true, nil, err + } + dep := obj.(*appsv1.Deployment).DeepCopy() + dep.Spec.Replicas = ptr.To(scale.Spec.Replicas) + if err := tracker.Update(deploymentsGVR, dep, ua.GetNamespace()); err != nil { + return true, nil, err + } + return true, scale, nil + }) + return &updateCalls +} + +func TestScaleDownClusterAutoscaler(t *testing.T) { + caDep := func(replicas int32) *appsv1.Deployment { + return &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{Name: "cluster-autoscaler", Namespace: "kube-system"}, + Spec: appsv1.DeploymentSpec{Replicas: ptr.To(replicas)}, + } + } + caPresent := clusterinfo.ClusterAutoscaler{Present: true, Namespace: "kube-system", Name: "cluster-autoscaler"} + + for _, tc := range []struct { + name string + // dep, when non-nil, is pre-loaded into the fake clientset. + dep *appsv1.Deployment + // ca is what Run would have passed in from clusterinfo.Classify. + ca clusterinfo.ClusterAutoscaler + // conflictFirstUpdate forces the scaleReactor to return Conflict on + // the first UpdateScale call so RetryOnConflict has to refetch. + conflictFirstUpdate bool + dryRun bool + // wantUpdateCalls is the minimum number of UpdateScale invocations + // the test expects (>= because retries may happen). + wantUpdateCalls int + // wantReplicas, when non-nil, is the final value of the Deployment's + // Spec.Replicas after the call. + wantReplicas *int32 + }{ + { + name: "CA absent is a no-op", + ca: clusterinfo.ClusterAutoscaler{Present: false}, + }, + { + name: "replicas already 0 skips Update", + dep: caDep(0), + ca: caPresent, + wantUpdateCalls: 0, + wantReplicas: ptr.To(int32(0)), + }, + { + name: "scales from 3 to 0", + dep: caDep(3), + ca: caPresent, + wantUpdateCalls: 1, + wantReplicas: ptr.To(int32(0)), + }, + { + name: "retries on Conflict", + dep: caDep(2), + ca: caPresent, + conflictFirstUpdate: true, + wantUpdateCalls: 2, + wantReplicas: ptr.To(int32(0)), + }, + { + name: "dry-run touches nothing", + dep: caDep(3), + ca: caPresent, + dryRun: true, + wantUpdateCalls: 0, + wantReplicas: ptr.To(int32(3)), + }, + } { + t.Run(tc.name, func(t *testing.T) { + var objs []runtime.Object + if tc.dep != nil { + objs = append(objs, tc.dep) + } + client := fake.NewClientset(objs...) + calls := scaleReactor(t, client, tc.conflictFirstUpdate) + + require.NoError(t, scaleDownClusterAutoscaler(t.Context(), client, tc.ca, tc.dryRun)) + + assert.GreaterOrEqual(t, *calls, tc.wantUpdateCalls, "minimum UpdateScale calls") + if tc.wantReplicas == nil { + return + } + got, err := client.AppsV1().Deployments(tc.ca.Namespace).Get(t.Context(), tc.ca.Name, metav1.GetOptions{}) + require.NoError(t, err) + require.NotNil(t, got.Spec.Replicas) + assert.Equal(t, *tc.wantReplicas, *got.Spec.Replicas) + }) + } +} diff --git a/cmd/kubectl-datadog/autoscaling/cluster/evict/cordon.go b/cmd/kubectl-datadog/autoscaling/cluster/evict/cordon.go new file mode 100644 index 0000000000..97973875b6 --- /dev/null +++ b/cmd/kubectl-datadog/autoscaling/cluster/evict/cordon.go @@ -0,0 +1,74 @@ +package evict + +import ( + "context" + "fmt" + "log" + + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/util/retry" +) + +// cordonNodes marks every node in the group Unschedulable up front, before any +// of them is drained — mirroring how EKS marks a managed node group +// unschedulable before draining it. A pod evicted from one node is then never +// rescheduled onto another node of the same group that is itself about to be +// drained. +// +// It returns the Node objects that are now cordoned and safe to drain, so the +// caller can read their immutable fields (e.g. providerID) without a second +// Get. A node that is already gone is skipped silently (absent from both +// return values); a node that fails to cordon is recorded in errs and left out +// of cordoned so the caller never drains a node that can still receive pods. +func cordonNodes(ctx context.Context, clientset kubernetes.Interface, nodes []string, dryRun bool) (cordoned []*corev1.Node, errs []error) { + for _, nodeName := range nodes { + node, err := cordonNode(ctx, clientset, nodeName, dryRun) + if err != nil { + errs = append(errs, fmt.Errorf("cordon node %s: %w", nodeName, err)) + continue + } + if node != nil { + cordoned = append(cordoned, node) + } + } + return cordoned, errs +} + +// cordonNode marks the node Unschedulable and returns the resulting Node so the +// caller can read immutable fields (e.g. providerID) without a second Get. It +// returns (nil, nil) when the node is already gone: there is nothing to +// schedule onto a deleted node, and a re-run rediscovers any surviving nodes. +// The Get + mutate + Update is wrapped in RetryOnConflict to survive races +// against kubelet, the scheduler, and any other controller that mutates Node +// objects. +func cordonNode(ctx context.Context, clientset kubernetes.Interface, name string, dryRun bool) (*corev1.Node, error) { + var cordoned *corev1.Node + err := retry.RetryOnConflict(retry.DefaultRetry, func() error { + node, err := clientset.CoreV1().Nodes().Get(ctx, name, metav1.GetOptions{}) + if err != nil { + if apierrors.IsNotFound(err) { + return nil // node already gone: nothing to schedule on it + } + return fmt.Errorf("failed to get node %s: %w", name, err) + } + if dryRun { + log.Printf("[dry-run] would cordon node %s", name) + } else if !node.Spec.Unschedulable { + node.Spec.Unschedulable = true + node, err = clientset.CoreV1().Nodes().Update(ctx, node, metav1.UpdateOptions{}) + if err != nil { + if apierrors.IsNotFound(err) { + return nil // deleted concurrently: nothing to schedule on it + } + return fmt.Errorf("failed to update node %s: %w", name, err) + } + log.Printf("Cordoned node %s.", name) + } + cordoned = node + return nil + }) + return cordoned, err +} diff --git a/cmd/kubectl-datadog/autoscaling/cluster/evict/cordon_test.go b/cmd/kubectl-datadog/autoscaling/cluster/evict/cordon_test.go new file mode 100644 index 0000000000..d325d9a311 --- /dev/null +++ b/cmd/kubectl-datadog/autoscaling/cluster/evict/cordon_test.go @@ -0,0 +1,134 @@ +package evict + +import ( + "errors" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/client-go/kubernetes/fake" + clienttesting "k8s.io/client-go/testing" +) + +func TestCordonNode(t *testing.T) { + for _, tc := range []struct { + name string + // initialUnschedulable is the node's starting `Spec.Unschedulable`. + initialUnschedulable bool + // conflictFirstUpdate forces a Conflict on the first Update so + // RetryOnConflict has to refetch and re-apply. + conflictFirstUpdate bool + // updateNotFound makes the Update return NotFound, simulating the node + // being deleted between the Get and the Update. + updateNotFound bool + dryRun bool + // wantUpdateCalls is the minimum number of Update invocations + // expected on the Nodes endpoint. + wantMinUpdateCalls int + wantUnschedulable bool + // wantNilNode expects cordonNode to return a nil Node (the node is gone). + wantNilNode bool + }{ + { + name: "schedulable node becomes cordoned", + wantMinUpdateCalls: 1, + wantUnschedulable: true, + }, + { + // Already cordoned ⇒ no Update issued (idempotent). + name: "already cordoned is a no-op", + initialUnschedulable: true, + wantMinUpdateCalls: 0, + wantUnschedulable: true, + }, + { + name: "retries on Conflict", + conflictFirstUpdate: true, + wantMinUpdateCalls: 2, + wantUnschedulable: true, + }, + { + name: "dry-run touches nothing", + dryRun: true, + wantMinUpdateCalls: 0, + wantUnschedulable: false, + }, + { + // Node deleted between the Get and the Update ⇒ NotFound on Update + // is a silent success (nothing to schedule onto a deleted node). + name: "deleted between get and update is a no-op", + updateNotFound: true, + wantMinUpdateCalls: 1, + wantUnschedulable: false, + wantNilNode: true, + }, + } { + t.Run(tc.name, func(t *testing.T) { + node := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{Name: "n1"}, + Spec: corev1.NodeSpec{Unschedulable: tc.initialUnschedulable}, + } + client := fake.NewClientset(node) + + var updateCalls int + client.PrependReactor("update", "nodes", func(_ clienttesting.Action) (bool, runtime.Object, error) { + updateCalls++ + if tc.conflictFirstUpdate && updateCalls == 1 { + return true, nil, apierrors.NewConflict( + schema.GroupResource{Resource: "nodes"}, "n1", errors.New("forced conflict"), + ) + } + if tc.updateNotFound { + return true, nil, apierrors.NewNotFound(schema.GroupResource{Resource: "nodes"}, "n1") + } + return false, nil, nil + }) + + gotNode, cordonErr := cordonNode(t.Context(), client, "n1", tc.dryRun) + require.NoError(t, cordonErr) + assert.Equal(t, tc.wantNilNode, gotNode == nil, "cordonNode nil-node contract") + + assert.GreaterOrEqual(t, updateCalls, tc.wantMinUpdateCalls, "minimum Update calls") + got, err := client.CoreV1().Nodes().Get(t.Context(), "n1", metav1.GetOptions{}) + require.NoError(t, err) + assert.Equal(t, tc.wantUnschedulable, got.Spec.Unschedulable) + }) + } +} + +// TestCordonNodes covers the up-front cordon pass: a live node is cordoned and +// returned (carrying its state so the caller needs no second Get), an +// already-gone node is a silent skip (no error, absent from the result, so a +// re-run after a partial execution is not derailed), and a node that fails to +// cordon is recorded as an error and excluded so the caller never drains a node +// that can still receive pods. +func TestCordonNodes(t *testing.T) { + client := fake.NewClientset( + &corev1.Node{ObjectMeta: metav1.ObjectMeta{Name: "ip-live"}}, + &corev1.Node{ObjectMeta: metav1.ObjectMeta{Name: "ip-bad"}}, + ) + // Updates to ip-bad fail with a non-conflict error so cordonNode gives up + // immediately (RetryOnConflict only retries Conflict). + client.PrependReactor("update", "nodes", func(action clienttesting.Action) (bool, runtime.Object, error) { + ua, ok := action.(clienttesting.UpdateAction) + if !ok || ua.GetObject().(*corev1.Node).Name != "ip-bad" { + return false, nil, nil + } + return true, nil, apierrors.NewInternalError(errors.New("update boom")) + }) + + // ip-gone is absent from the cluster: cordonNode must treat NotFound as a + // silent skip. + cordoned, errs := cordonNodes(t.Context(), client, []string{"ip-live", "ip-gone", "ip-bad"}, false) + + require.Len(t, cordoned, 1, "only the live, successfully-cordoned node is returned") + assert.Equal(t, "ip-live", cordoned[0].Name) + assert.True(t, cordoned[0].Spec.Unschedulable, "the returned node carries the cordoned state") + require.Len(t, errs, 1) + assert.ErrorContains(t, errs[0], "ip-bad") +} diff --git a/cmd/kubectl-datadog/autoscaling/cluster/evict/eks_mng.go b/cmd/kubectl-datadog/autoscaling/cluster/evict/eks_mng.go new file mode 100644 index 0000000000..bfd31aa41b --- /dev/null +++ b/cmd/kubectl-datadog/autoscaling/cluster/evict/eks_mng.go @@ -0,0 +1,123 @@ +package evict + +import ( + "context" + "errors" + "fmt" + "log" + "time" + + awssdk "github.com/aws/aws-sdk-go-v2/aws" + "github.com/aws/aws-sdk-go-v2/service/eks" + ekstypes "github.com/aws/aws-sdk-go-v2/service/eks/types" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + + commonaws "github.com/DataDog/datadog-operator/cmd/kubectl-datadog/autoscaling/cluster/common/aws" +) + +// errEKSDrainIncomplete is returned (wrapped) by waitEKSNodegroupEmpty when +// the deadline expires while nodes still carry the node group label. The +// orchestrator uses errors.Is to distinguish this case ("EKS drain is in +// progress but slow — keep temp PDBs in place") from a failed +// UpdateNodegroupConfig call ("EKS has not started draining — cleanup is +// safe"). +var errEKSDrainIncomplete = errors.New("EKS managed node group drain did not complete within the timeout") + +// EKSManagedNodeGroupAPI is the subset of *eks.Client used by +// evictEKSManagedNodeGroup. Defined as an interface so unit tests can stub the +// AWS SDK out cheaply. +type EKSManagedNodeGroupAPI interface { + DescribeNodegroup(ctx context.Context, in *eks.DescribeNodegroupInput, opts ...func(*eks.Options)) (*eks.DescribeNodegroupOutput, error) + UpdateNodegroupConfig(ctx context.Context, in *eks.UpdateNodegroupConfigInput, opts ...func(*eks.Options)) (*eks.UpdateNodegroupConfigOutput, error) +} + +// evictEKSManagedNodeGroup delegates the drain to EKS by setting the node +// group's scaling config to min=desired=0 (max is preserved from the existing +// scaling config because the EKS API rejects `maxSize < 1`). The EKS control +// plane then cordons and evicts pods from each node (respecting +// PodDisruptionBudgets) before terminating the underlying EC2 instances. We +// do not cordon or evict from kubectl-datadog because doing so concurrently +// with the EKS-managed drain can produce confusing logs and double the load +// on the apiserver. +// +// UpdateNodegroupConfig returns immediately, but we then **wait** for the +// drain to actually complete by polling the Kubernetes API for nodes carrying +// the `eks.amazonaws.com/nodegroup=` label. Without that wait, the +// caller (Run) would proceed to cleanup any temporary PodDisruptionBudgets +// before EKS even started evicting, leaving cross-type workloads (a +// Deployment with replicas on EKS MNG and on an ASG/Karpenter NodePool, for +// instance) unprotected mid-eviction. +// +// We never delete the node group: it may be Terraform-/CloudFormation-managed, +// and only the original owner should remove it. +func evictEKSManagedNodeGroup(ctx context.Context, eksAPI EKSManagedNodeGroupAPI, clientset kubernetes.Interface, clusterName, nodegroupName string, drainOpts nodeDrainOptions) error { + if drainOpts.DryRun { + log.Printf("[dry-run] would scale EKS node group %s/%s to min=desired=0 (max preserved) and wait for EKS to drain", clusterName, nodegroupName) + return nil + } + // EKS rejects maxSize < 1 (see NodegroupScalingConfig API reference), so + // preserve the existing maxSize when scaling to zero. min/desired both 0 + // is enough to drain the group; keeping the original max means a later + // caller (e.g. Terraform) can scale back up to the previously configured + // ceiling without re-reading the desired max. + desc, err := eksAPI.DescribeNodegroup(ctx, &eks.DescribeNodegroupInput{ + ClusterName: awssdk.String(clusterName), + NodegroupName: awssdk.String(nodegroupName), + }) + if err != nil { + return fmt.Errorf("DescribeNodegroup: %w", err) + } + maxSize := int32(1) + if desc.Nodegroup != nil && desc.Nodegroup.ScalingConfig != nil && desc.Nodegroup.ScalingConfig.MaxSize != nil && *desc.Nodegroup.ScalingConfig.MaxSize >= 1 { + maxSize = *desc.Nodegroup.ScalingConfig.MaxSize + } + if _, err := eksAPI.UpdateNodegroupConfig(ctx, &eks.UpdateNodegroupConfigInput{ + ClusterName: awssdk.String(clusterName), + NodegroupName: awssdk.String(nodegroupName), + ScalingConfig: &ekstypes.NodegroupScalingConfig{ + MinSize: awssdk.Int32(0), + MaxSize: awssdk.Int32(maxSize), + DesiredSize: awssdk.Int32(0), + }, + }); err != nil { + return fmt.Errorf("UpdateNodegroupConfig: %w", err) + } + log.Printf("EKS node group %s/%s scaling config set to min=desired=0 (max=%d); waiting for EKS to drain its nodes…", clusterName, nodegroupName, maxSize) + + return waitEKSNodegroupEmpty(ctx, clientset, clusterName, nodegroupName, drainOpts.NodeTimeout, drainOpts.PollInterval) +} + +// waitEKSNodegroupEmpty polls the Kubernetes API for nodes carrying the +// `eks.amazonaws.com/nodegroup=` label until none remain or the deadline +// expires. The EKS drain is bounded by drainOpts.NodeTimeout — for large node +// groups, callers should raise this above the default. +func waitEKSNodegroupEmpty(ctx context.Context, clientset kubernetes.Interface, clusterName, nodegroupName string, timeout, pollInterval time.Duration) error { + selector := commonaws.LabelEKSNodegroup + "=" + nodegroupName + deadline := time.Now().Add(timeout) + for { + list, err := clientset.CoreV1().Nodes().List(ctx, metav1.ListOptions{LabelSelector: selector}) + if err != nil { + // We already accepted the EKS scaling change, so EKS may still + // be draining. Wrap errEKSDrainIncomplete so the orchestrator + // keeps the temporary PDBs in place; a re-run will reach the + // poll loop again and cleanup once EKS converges. + return fmt.Errorf("list EKS managed node group %s/%s nodes: %w: %w", clusterName, nodegroupName, err, errEKSDrainIncomplete) + } + if len(list.Items) == 0 { + log.Printf("EKS node group %s/%s fully drained.", clusterName, nodegroupName) + return nil + } + if time.Now().After(deadline) { + return fmt.Errorf("EKS node group %s/%s: %d node(s) still present after %s: %w", clusterName, nodegroupName, len(list.Items), timeout, errEKSDrainIncomplete) + } + select { + case <-time.After(pollInterval): + case <-ctx.Done(): + // Cancellation (SIGINT) — per design, temp PDBs may leak and + // a subsequent run reaps them via the label selector. Do NOT + // wrap errEKSDrainIncomplete here. + return ctx.Err() + } + } +} diff --git a/cmd/kubectl-datadog/autoscaling/cluster/evict/eks_mng_test.go b/cmd/kubectl-datadog/autoscaling/cluster/evict/eks_mng_test.go new file mode 100644 index 0000000000..fbfa99b5df --- /dev/null +++ b/cmd/kubectl-datadog/autoscaling/cluster/evict/eks_mng_test.go @@ -0,0 +1,240 @@ +package evict + +import ( + "context" + "errors" + "testing" + "time" + + awssdk "github.com/aws/aws-sdk-go-v2/aws" + "github.com/aws/aws-sdk-go-v2/service/eks" + ekstypes "github.com/aws/aws-sdk-go-v2/service/eks/types" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/kubernetes/fake" + clienttesting "k8s.io/client-go/testing" +) + +type stubEKS struct { + gotInputs []*eks.UpdateNodegroupConfigInput + err error + describedMax int32 // 0 ⇒ stub returns nil ScalingConfig + describeErr error + describeCalled int +} + +func (s *stubEKS) DescribeNodegroup(_ context.Context, in *eks.DescribeNodegroupInput, _ ...func(*eks.Options)) (*eks.DescribeNodegroupOutput, error) { + s.describeCalled++ + if s.describeErr != nil { + return nil, s.describeErr + } + out := &eks.DescribeNodegroupOutput{ + Nodegroup: &ekstypes.Nodegroup{ + ClusterName: in.ClusterName, + NodegroupName: in.NodegroupName, + }, + } + if s.describedMax > 0 { + out.Nodegroup.ScalingConfig = &ekstypes.NodegroupScalingConfig{ + MaxSize: awssdk.Int32(s.describedMax), + } + } + return out, nil +} + +func (s *stubEKS) UpdateNodegroupConfig(_ context.Context, in *eks.UpdateNodegroupConfigInput, _ ...func(*eks.Options)) (*eks.UpdateNodegroupConfigOutput, error) { + s.gotInputs = append(s.gotInputs, in) + return &eks.UpdateNodegroupConfigOutput{}, s.err +} + +func mngDrainOpts() nodeDrainOptions { + return nodeDrainOptions{ + NodeTimeout: 5 * time.Second, + PollInterval: 10 * time.Millisecond, + } +} + +func TestEvictEKSManagedNodeGroup(t *testing.T) { + mngNode := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "ip-1", + Labels: map[string]string{"eks.amazonaws.com/nodegroup": "my-mng"}, + }, + } + apiErr := errors.New("api error") + listErr := errors.New("apiserver unreachable") + + for _, tc := range []struct { + name string + // updateErr is what stubEKS returns from UpdateNodegroupConfig. nil + // ⇒ success. + updateErr error + // describeErr is what stubEKS returns from DescribeNodegroup. nil + // ⇒ success. + describeErr error + // describedMax is the MaxSize reported by the stubbed + // DescribeNodegroup. 0 ⇒ stub returns nil ScalingConfig (simulating + // a node group described without a scaling config). + describedMax int32 + // nodes are pre-loaded into the fake clientset (i.e. nodes that EKS + // would be draining). + nodes []runtime.Object + // listResponder, when non-nil, overrides the default "list nodes" + // behaviour. Used to simulate the EKS-side drain finishing between + // polls, or a transient List error. + listResponder func(call int) (handle bool, resp runtime.Object, err error) + dryRun bool + // shortTimeout uses a 100ms NodeTimeout to force the wait loop to + // give up; default keeps the longer mngDrainOpts() value. + shortTimeout bool + + wantDescribeCallCount int + wantUpdateCallCount int + // wantUpdateMaxSize is the MaxSize the test asserts on the + // UpdateNodegroupConfig input. Only checked when + // wantUpdateCallCount > 0. + wantUpdateMaxSize int32 + wantErr bool + // wantWrapsSentinel asserts errors.Is(err, errEKSDrainIncomplete). + wantWrapsSentinel bool + // wantNotWrapsSentinel asserts errors.Is(err, errEKSDrainIncomplete) is false. + wantNotWrapsSentinel bool + wantErrContains string + }{ + { + name: "dry-run skips Describe + Update", + dryRun: true, + wantDescribeCallCount: 0, + wantUpdateCallCount: 0, + }, + { + name: "DescribeNodegroup failure short-circuits", + describeErr: apiErr, + wantDescribeCallCount: 1, + wantUpdateCallCount: 0, + wantErr: true, + wantErrContains: "DescribeNodegroup", + wantNotWrapsSentinel: true, + }, + { + name: "UpdateNodegroupConfig failure does not wrap sentinel", + describedMax: 5, + updateErr: apiErr, + wantDescribeCallCount: 1, + wantUpdateCallCount: 1, + wantUpdateMaxSize: 5, + wantErr: true, + wantErrContains: "UpdateNodegroupConfig", + wantNotWrapsSentinel: true, + }, + { + // EKS finishes draining between two polls: first List sees the + // node, second returns empty. Counter-based reactor avoids the + // time.Sleep + goroutine race the goroutine version had under + // loaded CI. + name: "waits for nodes to disappear", + nodes: []runtime.Object{mngNode}, + describedMax: 5, + listResponder: func(call int) (bool, runtime.Object, error) { + if call > 1 { + return true, &corev1.NodeList{}, nil + } + return false, nil, nil + }, + wantDescribeCallCount: 1, + wantUpdateCallCount: 1, + wantUpdateMaxSize: 5, + }, + { + // Described scaling config is nil ⇒ Update must still pass an + // API-valid MaxSize (>= 1). Ensures evict never blindly sends 0. + name: "describe without scaling config falls back to max=1", + nodes: []runtime.Object{mngNode}, + describedMax: 0, + shortTimeout: true, + wantDescribeCallCount: 1, + wantUpdateCallCount: 1, + wantUpdateMaxSize: 1, + wantErr: true, + wantErrContains: "my-mng", + wantWrapsSentinel: true, + }, + { + // Wait timeout while nodes persist: orchestrator must keep + // temp PDBs in place ⇒ sentinel wrapped. + name: "wait timeout wraps the sentinel", + nodes: []runtime.Object{mngNode}, + describedMax: 5, + shortTimeout: true, + wantDescribeCallCount: 1, + wantUpdateCallCount: 1, + wantUpdateMaxSize: 5, + wantErr: true, + wantErrContains: "my-mng", + wantWrapsSentinel: true, + }, + { + // EKS scaling change succeeded, but a subsequent K8s Node list + // call errors out. EKS may still be draining ⇒ sentinel wrapped. + name: "post-Update list failure wraps the sentinel", + describedMax: 5, + listResponder: func(_ int) (bool, runtime.Object, error) { return true, nil, listErr }, + wantDescribeCallCount: 1, + wantUpdateCallCount: 1, + wantUpdateMaxSize: 5, + wantErr: true, + wantWrapsSentinel: true, + }, + } { + t.Run(tc.name, func(t *testing.T) { + stub := &stubEKS{ + err: tc.updateErr, + describeErr: tc.describeErr, + describedMax: tc.describedMax, + } + client := fake.NewClientset(tc.nodes...) + if tc.listResponder != nil { + var listCalls int + client.PrependReactor("list", "nodes", func(_ clienttesting.Action) (bool, runtime.Object, error) { + listCalls++ + return tc.listResponder(listCalls) + }) + } + opts := mngDrainOpts() + opts.DryRun = tc.dryRun + if tc.shortTimeout { + opts.NodeTimeout = 100 * time.Millisecond + opts.PollInterval = 30 * time.Millisecond + } + + err := evictEKSManagedNodeGroup(t.Context(), stub, client, "my-cluster", "my-mng", opts) + if tc.wantErr { + require.Error(t, err) + if tc.wantErrContains != "" { + assert.Contains(t, err.Error(), tc.wantErrContains) + } + } else { + require.NoError(t, err) + } + if tc.wantWrapsSentinel { + assert.ErrorIs(t, err, errEKSDrainIncomplete) + } + if tc.wantNotWrapsSentinel { + assert.NotErrorIs(t, err, errEKSDrainIncomplete) + } + assert.Equal(t, tc.wantDescribeCallCount, stub.describeCalled) + require.Len(t, stub.gotInputs, tc.wantUpdateCallCount) + for _, in := range stub.gotInputs { + assert.Equal(t, "my-cluster", awssdk.ToString(in.ClusterName)) + assert.Equal(t, "my-mng", awssdk.ToString(in.NodegroupName)) + require.NotNil(t, in.ScalingConfig) + assert.Equal(t, int32(0), awssdk.ToInt32(in.ScalingConfig.MinSize)) + assert.Equal(t, tc.wantUpdateMaxSize, awssdk.ToInt32(in.ScalingConfig.MaxSize)) + assert.Equal(t, int32(0), awssdk.ToInt32(in.ScalingConfig.DesiredSize)) + } + }) + } +} diff --git a/cmd/kubectl-datadog/autoscaling/cluster/evict/evict.go b/cmd/kubectl-datadog/autoscaling/cluster/evict/evict.go new file mode 100644 index 0000000000..e66d4385b6 --- /dev/null +++ b/cmd/kubectl-datadog/autoscaling/cluster/evict/evict.go @@ -0,0 +1,157 @@ +package evict + +import ( + "context" + "errors" + "fmt" + "os/signal" + "syscall" + "time" + + "github.com/spf13/cobra" + "k8s.io/cli-runtime/pkg/genericclioptions" + + "github.com/DataDog/datadog-operator/cmd/kubectl-datadog/autoscaling/cluster/common/clients" + "github.com/DataDog/datadog-operator/pkg/plugin/common" +) + +var evictExample = ` + # evict every node group that is not Datadog-managed (cluster-autoscaler ASGs, + # EKS managed node groups, user Karpenter NodePools, standalone EC2) + %[1]s evict-legacy-nodes --all + + # evict a single ASG by name + %[1]s evict-legacy-nodes --target=asg/my-legacy-asg + + # preview the actions without performing them + %[1]s evict-legacy-nodes --all --dry-run +` + +type options struct { + genericclioptions.IOStreams + common.Options + args []string + + clusterName string + karpenterNamespace string + all bool + targetSpecs []string + targets []Target // populated by validate() + skipCA bool + ensurePDBs bool + evictionTimeout time.Duration + nodeTimeout time.Duration + dryRun bool + yes bool + debug bool +} + +func newOptions(streams genericclioptions.IOStreams) *options { + o := &options{ + IOStreams: streams, + ensurePDBs: true, + evictionTimeout: 5 * time.Minute, + nodeTimeout: 15 * time.Minute, + } + o.SetConfigFlags() + return o +} + +// New returns the cobra command for `kubectl datadog autoscaling cluster evict-legacy-nodes`. +func New(streams genericclioptions.IOStreams) *cobra.Command { + o := newOptions(streams) + cmd := &cobra.Command{ + Use: "evict-legacy-nodes", + Short: "Drain workloads from non-Datadog node groups onto Datadog-managed Karpenter NodePools", + Example: fmt.Sprintf(evictExample, "kubectl datadog autoscaling cluster"), + SilenceUsage: true, + RunE: func(c *cobra.Command, args []string) error { + if err := o.complete(c, args); err != nil { + return err + } + if err := o.validate(); err != nil { + return err + } + return o.run() + }, + } + + cmd.Flags().StringVar(&o.clusterName, "cluster-name", "", "Name of the EKS cluster") + cmd.Flags().StringVar(&o.karpenterNamespace, "karpenter-namespace", "", "Namespace where Karpenter is deployed (auto-detected when empty)") + cmd.Flags().BoolVar(&o.all, "all", false, "Evict every node group that is not managed by Datadog") + cmd.Flags().StringSliceVar(&o.targetSpecs, "target", nil, "Target a specific node group: /, with one of asg, eksManagedNodeGroup, karpenter. Use `standalone` (no name) for standalone EC2 instances. Repeatable. Mutually exclusive with --all.") + cmd.Flags().BoolVar(&o.skipCA, "skip-cluster-autoscaler", false, "Do not scale the cluster-autoscaler Deployment to 0 replicas as step 1") + cmd.Flags().BoolVar(&o.ensurePDBs, "ensure-pdbs", true, "Create temporary PodDisruptionBudgets (maxUnavailable: 1) for workloads without one, and remove them at the end") + cmd.Flags().DurationVar(&o.evictionTimeout, "eviction-timeout", 5*time.Minute, "Time budget per pod for the Eviction API to succeed before giving up (PDB-blocked pods)") + cmd.Flags().DurationVar(&o.nodeTimeout, "node-timeout", 15*time.Minute, "Time budget per node for it to become empty after pods have been evicted") + cmd.Flags().BoolVar(&o.dryRun, "dry-run", false, "Log the actions that would be taken without performing them") + cmd.Flags().BoolVar(&o.yes, "yes", false, "Skip the confirmation prompt") + cmd.Flags().BoolVar(&o.debug, "debug", false, "Enable debug logs") + + o.ConfigFlags.AddFlags(cmd.Flags()) + + return cmd +} + +func (o *options) complete(cmd *cobra.Command, args []string) error { + o.args = args + return o.Init(cmd) +} + +func (o *options) validate() error { + if len(o.args) > 0 { + return errors.New("no positional arguments are allowed") + } + if o.all && len(o.targetSpecs) > 0 { + return errors.New("--all and --target are mutually exclusive") + } + if !o.all && len(o.targetSpecs) == 0 { + return errors.New("at least one of --all or --target must be provided") + } + if o.evictionTimeout <= 0 { + return errors.New("--eviction-timeout must be positive") + } + if o.nodeTimeout <= 0 { + return errors.New("--node-timeout must be positive") + } + var ( + parsed []Target + errs []error + ) + for _, spec := range o.targetSpecs { + if t, err := ParseTargetSpec(spec); err != nil { + errs = append(errs, err) + } else { + parsed = append(parsed, t) + } + } + if len(errs) > 0 { + return errors.Join(errs...) + } + o.targets = parsed + return nil +} + +func (o *options) run() error { + ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM) + defer stop() + + clusterName, err := clients.ResolveClusterName(o.ConfigFlags, o.clusterName) + if err != nil { + return err + } + + return Run(ctx, o.IOStreams, o.ConfigFlags, o.Clientset, RunOptions{ + ClusterName: clusterName, + KarpenterNamespace: o.karpenterNamespace, + All: o.all, + Targets: o.targets, + SkipCA: o.skipCA, + EnsurePDBs: o.ensurePDBs, + DryRun: o.dryRun, + Yes: o.yes, + EvictionTimeout: o.evictionTimeout, + NodeTimeout: o.nodeTimeout, + Debug: o.debug, + }) +} diff --git a/cmd/kubectl-datadog/autoscaling/cluster/evict/evict_pods.go b/cmd/kubectl-datadog/autoscaling/cluster/evict/evict_pods.go new file mode 100644 index 0000000000..c3496fc109 --- /dev/null +++ b/cmd/kubectl-datadog/autoscaling/cluster/evict/evict_pods.go @@ -0,0 +1,172 @@ +package evict + +import ( + "context" + "fmt" + "log" + "slices" + "time" + + "github.com/samber/lo" + corev1 "k8s.io/api/core/v1" + policyv1 "k8s.io/api/policy/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/fields" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/tools/pager" +) + +// nodeDrainOptions captures the per-call tunables for evicting a node's pods. +type nodeDrainOptions struct { + DryRun bool + EvictionTimeout time.Duration // per pod, bound for retries on 429 + NodeTimeout time.Duration // total wait for the node to become empty + PollInterval time.Duration // interval between empty-checks; default 2s +} + +// drainNode evicts every evictable pod from the node and waits for the node to +// become empty. Pods owned by a DaemonSet, mirror pods, terminating pods and +// completed Job pods are skipped — the kubelet handles their cleanup when the +// underlying instance disappears. +// +// Pods that cannot be evicted (PDB-blocked beyond EvictionTimeout, etc.) are +// logged as warnings; drainNode then continues with the remaining pods rather +// than aborting the whole run. +func drainNode(ctx context.Context, clientset kubernetes.Interface, nodeName string, opts nodeDrainOptions) error { + if opts.DryRun { + log.Printf("[dry-run] would drain node %s", nodeName) + return nil + } + pods, err := listPodsOnNode(ctx, clientset, nodeName) + if err != nil { + return fmt.Errorf("failed to list pods on node %s: %w", nodeName, err) + } + for _, p := range pods { + if shouldSkipEviction(&p) { + continue + } + if err := evictPodWithRetry(ctx, clientset, &p, opts.EvictionTimeout, opts.PollInterval); err != nil { + log.Printf("Warning: pod %s/%s: %v", p.Namespace, p.Name, err) + } + } + return waitForNodeEmpty(ctx, clientset, nodeName, opts.NodeTimeout, opts.PollInterval) +} + +// listPodsOnNode enumerates pods scheduled on the given node, server-side +// filtered via the spec.nodeName field selector. Uses the client-go pager +// defaults so very large nodes (250 pods+) don't trigger oversized list calls. +func listPodsOnNode(ctx context.Context, clientset kubernetes.Interface, nodeName string) (pods []corev1.Pod, err error) { + p := pager.New(func(ctx context.Context, opts metav1.ListOptions) (runtime.Object, error) { + return clientset.CoreV1().Pods(metav1.NamespaceAll).List(ctx, opts) + }) + if err = p.EachListItem(ctx, metav1.ListOptions{ + FieldSelector: fields.OneTermEqualSelector("spec.nodeName", nodeName).String(), + }, func(obj runtime.Object) error { + pods = append(pods, *obj.(*corev1.Pod)) + return nil + }); err != nil { + return nil, err + } + return pods, nil +} + +// evictPodWithRetry sends a single Eviction request and retries while the +// apiserver returns 429 TooManyRequests (the canonical PDB-blocked signal). +// Aborts after timeout; the caller then logs and moves to the next pod. +// +// 404 (pod already gone) is treated as success — the eviction goal is met. +// Non-429 errors are returned immediately. +func evictPodWithRetry(ctx context.Context, clientset kubernetes.Interface, p *corev1.Pod, timeout, retryInterval time.Duration) error { + eviction := &policyv1.Eviction{ + ObjectMeta: metav1.ObjectMeta{Name: p.Name, Namespace: p.Namespace}, + } + deadline := time.Now().Add(timeout) + for { + err := clientset.CoreV1().Pods(p.Namespace).EvictV1(ctx, eviction) + switch { + case err == nil: + log.Printf("Evicted pod %s/%s.", p.Namespace, p.Name) + return nil + case apierrors.IsNotFound(err): + return nil + case !apierrors.IsTooManyRequests(err): + return fmt.Errorf("eviction failed: %w", err) + case time.Now().After(deadline): + return fmt.Errorf("eviction timed out (likely PDB-blocked): %w", err) + default: + select { + case <-time.After(retryInterval): + case <-ctx.Done(): + return ctx.Err() + } + } + } +} + +// waitForNodeEmpty polls the node until no pod still occupies it. A pod is +// considered to still occupy the node until it is actually gone from the API +// server — including pods that are merely terminating (DeletionTimestamp +// set). This matters for callers that terminate the underlying EC2 instance +// next: returning early while a pod is mid-grace-period would kill the +// container before its preStop hook / graceful shutdown finishes. +// +// DaemonSet pods, mirror pods and completed pods are not counted: DS and +// mirror pods are tied to the node's lifecycle and disappear with it, +// completed pods are inert and GC'd promptly. +func waitForNodeEmpty(ctx context.Context, clientset kubernetes.Interface, nodeName string, timeout, pollInterval time.Duration) error { + deadline := time.Now().Add(timeout) + for { + pods, err := listPodsOnNode(ctx, clientset, nodeName) + if err != nil { + return fmt.Errorf("failed to list pods on node %s: %w", nodeName, err) + } + remaining := lo.CountBy(pods, func(p corev1.Pod) bool { + return podOccupiesNode(&p) + }) + if remaining == 0 { + return nil + } + if time.Now().After(deadline) { + return fmt.Errorf("timed out waiting for node %s to drain: %d pod(s) still present", nodeName, remaining) + } + select { + case <-time.After(pollInterval): + case <-ctx.Done(): + return ctx.Err() + } + } +} + +// shouldSkipEviction reports whether drainNode should leave a pod alone +// rather than call the Eviction API on it. A pod already terminating is +// skipped because issuing another eviction would 404 once the kubelet +// finishes the deletion; DaemonSet/mirror/completed pods are skipped because +// they cannot or need not be evicted. +func shouldSkipEviction(p *corev1.Pod) bool { + return p.DeletionTimestamp != nil || isMirrorPod(p) || isDaemonSetPod(p) || isCompleted(p) +} + +// podOccupiesNode reports whether a pod still consumes the node from a drain +// perspective. Terminating pods DO occupy the node until they are gone; +// returning false for them prematurely would let the caller terminate the +// instance before the container finished its grace period. +func podOccupiesNode(p *corev1.Pod) bool { + return !isMirrorPod(p) && !isDaemonSetPod(p) && !isCompleted(p) +} + +func isMirrorPod(p *corev1.Pod) bool { + _, ok := p.Annotations[corev1.MirrorPodAnnotationKey] + return ok +} + +func isDaemonSetPod(p *corev1.Pod) bool { + return slices.ContainsFunc(p.OwnerReferences, func(owner metav1.OwnerReference) bool { + return owner.Kind == "DaemonSet" + }) +} + +func isCompleted(p *corev1.Pod) bool { + return p.Status.Phase == corev1.PodSucceeded || p.Status.Phase == corev1.PodFailed +} diff --git a/cmd/kubectl-datadog/autoscaling/cluster/evict/evict_pods_test.go b/cmd/kubectl-datadog/autoscaling/cluster/evict/evict_pods_test.go new file mode 100644 index 0000000000..c6f386f5b2 --- /dev/null +++ b/cmd/kubectl-datadog/autoscaling/cluster/evict/evict_pods_test.go @@ -0,0 +1,214 @@ +package evict + +import ( + "errors" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" + policyv1 "k8s.io/api/policy/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/client-go/kubernetes/fake" + clienttesting "k8s.io/client-go/testing" +) + +// installPodEvictionReactor makes a fake clientset accept Eviction +// subresource creates and echo the eviction object back. The Pod itself is +// NOT removed from the tracker — tests that want a pod-removal effect (e.g. +// to make drainNode observe an empty node) must add a "delete pods" reactor +// explicitly, or use a fixture that starts with no pod on the node. +func installPodEvictionReactor(client *fake.Clientset) { + client.PrependReactor("create", "pods", func(action clienttesting.Action) (bool, runtime.Object, error) { + ca, ok := action.(clienttesting.CreateAction) + if !ok || ca.GetSubresource() != "eviction" { + return false, nil, nil + } + return true, ca.GetObject(), nil + }) +} + +func TestShouldSkipEviction(t *testing.T) { + now := metav1.Now() + for _, tc := range []struct { + name string + pod *corev1.Pod + skip bool + }{ + {name: "regular pod", pod: &corev1.Pod{}, skip: false}, + {name: "terminating", pod: &corev1.Pod{ObjectMeta: metav1.ObjectMeta{DeletionTimestamp: &now}}, skip: true}, + {name: "mirror", pod: &corev1.Pod{ObjectMeta: metav1.ObjectMeta{ + Annotations: map[string]string{corev1.MirrorPodAnnotationKey: "x"}, + }}, skip: true}, + {name: "daemonset", pod: &corev1.Pod{ObjectMeta: metav1.ObjectMeta{ + OwnerReferences: []metav1.OwnerReference{{Kind: "DaemonSet", Name: "ds"}}, + }}, skip: true}, + {name: "succeeded job", pod: &corev1.Pod{Status: corev1.PodStatus{Phase: corev1.PodSucceeded}}, skip: true}, + {name: "failed job", pod: &corev1.Pod{Status: corev1.PodStatus{Phase: corev1.PodFailed}}, skip: true}, + } { + t.Run(tc.name, func(t *testing.T) { + assert.Equal(t, tc.skip, shouldSkipEviction(tc.pod)) + }) + } +} + +// TestPodOccupiesNode locks in the asymmetry between "skip eviction" and +// "keeps the node busy": terminating pods are skipped from eviction (the +// kubelet is already deleting them) BUT still occupy the node for drain +// purposes (otherwise we'd terminate the instance mid-grace-period and kill +// the container before its preStop hook finishes). +func TestPodOccupiesNode(t *testing.T) { + now := metav1.Now() + for _, tc := range []struct { + name string + pod *corev1.Pod + occupies bool + }{ + {name: "regular pod", pod: &corev1.Pod{}, occupies: true}, + {name: "terminating pod still occupies", pod: &corev1.Pod{ObjectMeta: metav1.ObjectMeta{DeletionTimestamp: &now}}, occupies: true}, + {name: "mirror pod", pod: &corev1.Pod{ObjectMeta: metav1.ObjectMeta{ + Annotations: map[string]string{corev1.MirrorPodAnnotationKey: "x"}, + }}, occupies: false}, + {name: "daemonset pod", pod: &corev1.Pod{ObjectMeta: metav1.ObjectMeta{ + OwnerReferences: []metav1.OwnerReference{{Kind: "DaemonSet", Name: "ds"}}, + }}, occupies: false}, + {name: "succeeded job", pod: &corev1.Pod{Status: corev1.PodStatus{Phase: corev1.PodSucceeded}}, occupies: false}, + {name: "failed job", pod: &corev1.Pod{Status: corev1.PodStatus{Phase: corev1.PodFailed}}, occupies: false}, + } { + t.Run(tc.name, func(t *testing.T) { + assert.Equal(t, tc.occupies, podOccupiesNode(tc.pod)) + }) + } +} + +func TestEvictPodWithRetry(t *testing.T) { + // evictionResponder shapes the test-side of the Eviction subresource + // reactor. `call` is the 1-based call counter. + type evictionResponder func(call int, eviction *policyv1.Eviction) (resp runtime.Object, err error) + + echoSuccess := evictionResponder(func(_ int, e *policyv1.Eviction) (runtime.Object, error) { + return e, nil + }) + tooManyOnceThenSuccess := evictionResponder(func(call int, e *policyv1.Eviction) (runtime.Object, error) { + if call == 1 { + return nil, apierrors.NewTooManyRequests("PDB blocked", 1) + } + return e, nil + }) + notFound := evictionResponder(func(_ int, e *policyv1.Eviction) (runtime.Object, error) { + return nil, apierrors.NewNotFound(schema.GroupResource{Resource: "pods"}, e.Name) + }) + nonRetryable := evictionResponder(func(_ int, _ *policyv1.Eviction) (runtime.Object, error) { + return nil, errors.New("non-retryable") + }) + + for _, tc := range []struct { + name string + // pod is the object passed to evictPodWithRetry (and pre-loaded + // into the fake when seedPod is true). + pod *corev1.Pod + seedPod bool + // responder controls what the Eviction reactor returns each call. + responder evictionResponder + timeout time.Duration + + wantErr bool + wantErrContains string + wantMinCalls int + wantCapturedName string + wantCapturedNs string + }{ + { + name: "success on first call", + pod: &corev1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "p1", Namespace: "default"}}, + seedPod: true, + responder: echoSuccess, + timeout: 5 * time.Second, + wantMinCalls: 1, + }, + { + name: "retries on 429 then succeeds", + pod: &corev1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "p1", Namespace: "default"}}, + seedPod: true, + responder: tooManyOnceThenSuccess, + timeout: 10 * time.Second, + wantMinCalls: 2, + }, + { + // Pod already gone: 404 from the apiserver is treated as + // success — the eviction goal is met regardless. + name: "404 from apiserver is success", + pod: &corev1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "p1", Namespace: "default"}}, + seedPod: false, + responder: notFound, + timeout: time.Second, + wantMinCalls: 1, + }, + { + name: "non-retryable error returned", + pod: &corev1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "p1", Namespace: "default"}}, + seedPod: true, + responder: nonRetryable, + timeout: 5 * time.Second, + wantErr: true, + wantErrContains: "eviction failed", + wantMinCalls: 1, + }, + { + // Smoke check that the Eviction object carries the pod's + // Name/Namespace through to the apiserver. + name: "eviction object name/namespace", + pod: &corev1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "p1", Namespace: "ns1"}}, + seedPod: true, + responder: echoSuccess, + timeout: time.Second, + wantMinCalls: 1, + wantCapturedName: "p1", + wantCapturedNs: "ns1", + }, + } { + t.Run(tc.name, func(t *testing.T) { + var seed []runtime.Object + if tc.seedPod { + seed = append(seed, tc.pod) + } + client := fake.NewClientset(seed...) + + var ( + calls int + captured *policyv1.Eviction + ) + client.PrependReactor("create", "pods", func(action clienttesting.Action) (bool, runtime.Object, error) { + ca, ok := action.(clienttesting.CreateAction) + if !ok || ca.GetSubresource() != "eviction" { + return false, nil, nil + } + calls++ + eviction := ca.GetObject().(*policyv1.Eviction) + captured = eviction + resp, err := tc.responder(calls, eviction) + return true, resp, err + }) + + err := evictPodWithRetry(t.Context(), client, tc.pod, tc.timeout, 10*time.Millisecond) + if tc.wantErr { + require.Error(t, err) + if tc.wantErrContains != "" { + assert.Contains(t, err.Error(), tc.wantErrContains) + } + } else { + require.NoError(t, err) + } + assert.GreaterOrEqual(t, calls, tc.wantMinCalls) + if tc.wantCapturedName != "" { + require.NotNil(t, captured) + assert.Equal(t, tc.wantCapturedName, captured.Name) + assert.Equal(t, tc.wantCapturedNs, captured.Namespace) + } + }) + } +} diff --git a/cmd/kubectl-datadog/autoscaling/cluster/evict/karpenter_user.go b/cmd/kubectl-datadog/autoscaling/cluster/evict/karpenter_user.go new file mode 100644 index 0000000000..1473a2aa01 --- /dev/null +++ b/cmd/kubectl-datadog/autoscaling/cluster/evict/karpenter_user.go @@ -0,0 +1,39 @@ +package evict + +import ( + "context" + "errors" + "fmt" + "log" + + "k8s.io/client-go/kubernetes" +) + +// evictKarpenterUserNodePool cordons and drains the nodes managed by a user- +// created Karpenter NodePool. Once each node is cordoned and empty, Karpenter +// itself observes the state and terminates the underlying NodeClaim, and the +// pods re-scheduled by their controllers will be placed by Karpenter on the +// best-matching NodePool (ideally one managed by Datadog, if its spec.weight +// is high enough). +// +// The user NodePool spec is NOT modified — this command intentionally leaves +// user-managed NodePool configuration intact. If a user NodePool has a +// spec.weight greater than or equal to the Datadog NodePool weight, evicted +// pods may land on a freshly provisioned node from the SAME user NodePool. +// That case is surfaced by a pre-flight warning in the orchestrator; this +// function makes no attempt to re-balance the weights. +func evictKarpenterUserNodePool(ctx context.Context, clientset kubernetes.Interface, nodePoolName string, nodes []string, drainOpts nodeDrainOptions) error { + // Cordon every node up front so a pod evicted from one node is never + // rescheduled onto another node of the same NodePool that is itself about + // to be drained. + cordoned, errs := cordonNodes(ctx, clientset, nodes, drainOpts.DryRun) + for _, node := range cordoned { + if err := drainNode(ctx, clientset, node.Name, drainOpts); err != nil { + errs = append(errs, fmt.Errorf("drain node %s: %w", node.Name, err)) + } + } + if !drainOpts.DryRun && len(errs) == 0 { + log.Printf("Drained %d node(s) from user NodePool %s; Karpenter will terminate their NodeClaims once empty.", len(cordoned), nodePoolName) + } + return errors.Join(errs...) +} diff --git a/cmd/kubectl-datadog/autoscaling/cluster/evict/karpenter_user_test.go b/cmd/kubectl-datadog/autoscaling/cluster/evict/karpenter_user_test.go new file mode 100644 index 0000000000..1f2bbff530 --- /dev/null +++ b/cmd/kubectl-datadog/autoscaling/cluster/evict/karpenter_user_test.go @@ -0,0 +1,33 @@ +package evict + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes/fake" +) + +func TestEvictKarpenterUserNodePool(t *testing.T) { + for _, tc := range []struct { + name string + dryRun bool + wantUnschedulable bool + }{ + {name: "cordons and drains", dryRun: false, wantUnschedulable: true}, + {name: "dry-run touches nothing", dryRun: true, wantUnschedulable: false}, + } { + t.Run(tc.name, func(t *testing.T) { + node := &corev1.Node{ObjectMeta: metav1.ObjectMeta{Name: "n1"}} + client := fake.NewClientset(node) + + require.NoError(t, evictKarpenterUserNodePool(t.Context(), client, "user-np", []string{"n1"}, newDrainOpts(tc.dryRun))) + + got, err := client.CoreV1().Nodes().Get(t.Context(), "n1", metav1.GetOptions{}) + require.NoError(t, err) + assert.Equal(t, tc.wantUnschedulable, got.Spec.Unschedulable) + }) + } +} diff --git a/cmd/kubectl-datadog/autoscaling/cluster/evict/pdb.go b/cmd/kubectl-datadog/autoscaling/cluster/evict/pdb.go new file mode 100644 index 0000000000..9271184939 --- /dev/null +++ b/cmd/kubectl-datadog/autoscaling/cluster/evict/pdb.go @@ -0,0 +1,355 @@ +package evict + +import ( + "context" + "errors" + "fmt" + "log" + "reflect" + "slices" + "strings" + + "github.com/samber/lo" + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + policyv1 "k8s.io/api/policy/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/util/intstr" + "k8s.io/apimachinery/pkg/util/validation" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/tools/pager" + "sigs.k8s.io/controller-runtime/pkg/client" + + commonk8s "github.com/DataDog/datadog-operator/cmd/kubectl-datadog/autoscaling/cluster/common/k8s" +) + +// Label keys used to mark PodDisruptionBudgets created by this command. Both +// must be present for cleanup to consider the PDB ours — this makes the +// cleanup safe against accidentally removing a user PDB with a colliding name. +const ( + pdbManagedByLabelKey = "app.kubernetes.io/managed-by" + pdbManagedByLabelValue = "kubectl-datadog" + pdbTempLabelKey = "autoscaling.datadoghq.com/temporary-pdb" + pdbTempLabelValue = "true" + // pdbNameSuffix is appended to the controller name to form the temp PDB + // name. Kept short so that long controller names stay under the 63-char + // DNS label limit after truncation. + pdbNameSuffix = "-evict-legacy" +) + +// ensureTempPDBs scans the pods running on the nodes of every target and +// creates a temporary PodDisruptionBudget (maxUnavailable: 1) for each +// top-level controller (Deployment, StatefulSet, bare ReplicaSet) that does +// not already have one with a matching selector. +// +// The created PDBs carry two labels (managed-by + temporary-pdb) that the +// cleanup step uses to find and delete them, regardless of which process +// created them. ensureTempPDBs itself is idempotent: a PDB created by a +// previous (possibly crashed) run is detected by its labels and left alone. +func ensureTempPDBs(ctx context.Context, clientset kubernetes.Interface, ctrlClient client.Client, targets []Target, dryRun bool) error { + // Targets across all manager types are merged: the orchestrator blocks + // on waitEKSNodegroupEmpty before cleaning up the temporary PDBs, so EKS + // MNG nodes observe the PDBs during their drain too — without that, EKS + // could disrupt every replica of an otherwise unprotected workload at + // once when all replicas happen to live on the same node group. + allNodes := lo.FlatMap(targets, func(t Target, _ int) []string { return t.Nodes }) + nodeSet := lo.SliceToMap(allNodes, func(n string) (string, struct{}) { return n, struct{}{} }) + if len(nodeSet) == 0 { + return nil + } + + controllers, err := discoverControllers(ctx, clientset, nodeSet) + if err != nil { + return fmt.Errorf("failed to discover controllers: %w", err) + } + if len(controllers) == 0 { + return nil + } + + // Group controllers by namespace to amortize the per-namespace PDB list. + byNamespace := make(map[string][]controllerInfo) + for _, c := range controllers { + byNamespace[c.Namespace] = append(byNamespace[c.Namespace], c) + } + + var errs []error + for ns, ctrls := range byNamespace { + existing, err := listNamespacePDBs(ctx, clientset, ns) + if err != nil { + errs = append(errs, fmt.Errorf("namespace %s: failed to list PDBs: %w", ns, err)) + continue + } + for _, c := range ctrls { + if hasUserPDB(existing, c.Selector) { + continue + } + if err := createTempPDB(ctx, ctrlClient, c, dryRun); err != nil { + errs = append(errs, fmt.Errorf("controller %s/%s/%s: %w", c.Namespace, c.Kind, c.Name, err)) + } + } + } + return errors.Join(errs...) +} + +// cleanupTempPDBs deletes every PodDisruptionBudget cluster-wide that carries +// both temp-PDB labels. Listing by labels (not by a state struct returned from +// ensureTempPDBs) is what makes the command crash-safe: a re-run after a kill +// still finds and removes the orphans left by the previous attempt. +func cleanupTempPDBs(ctx context.Context, ctrlClient client.Client, dryRun bool) error { + list := &policyv1.PodDisruptionBudgetList{} + if err := ctrlClient.List(ctx, list, client.MatchingLabels{ + pdbManagedByLabelKey: pdbManagedByLabelValue, + pdbTempLabelKey: pdbTempLabelValue, + }); err != nil { + return fmt.Errorf("failed to list temporary PDBs: %w", err) + } + if len(list.Items) == 0 { + return nil + } + var errs []error + for i := range list.Items { + pdb := &list.Items[i] + if dryRun { + log.Printf("[dry-run] would delete PDB %s/%s", pdb.Namespace, pdb.Name) + continue + } + if err := commonk8s.Delete(ctx, ctrlClient, pdb); err != nil { + errs = append(errs, fmt.Errorf("PDB %s/%s: %w", pdb.Namespace, pdb.Name, err)) + } + } + return errors.Join(errs...) +} + +// controllerKey identifies a top-level controller that owns evictable pods on +// our target nodes. It is the dedup key in the seen map and the identity half +// of controllerInfo. +type controllerKey struct { + Namespace string + Kind string // "Deployment", "StatefulSet", "ReplicaSet" + Name string +} + +// controllerInfo is a controllerKey plus the controller's pod selector — what a +// PDB would match on. +type controllerInfo struct { + controllerKey + Selector *metav1.LabelSelector +} + +// discoverControllers lists every Pod cluster-wide once (paginated) and, for +// each Pod scheduled on one of the target nodes, resolves the top-level +// controller. Listing once and filtering client-side avoids the N-API-calls +// problem of doing one List per node, which on a large legacy fleet would +// dominate the command's wall-clock time. The resulting slice contains each +// controller at most once. +func discoverControllers(ctx context.Context, clientset kubernetes.Interface, nodeSet map[string]struct{}) ([]controllerInfo, error) { + seen := make(map[controllerKey]*metav1.LabelSelector) + depCache := make(map[client.ObjectKey]*appsv1.Deployment) + rsCache := make(map[client.ObjectKey]*appsv1.ReplicaSet) + stsCache := make(map[client.ObjectKey]*appsv1.StatefulSet) + + p := pager.New(func(ctx context.Context, opts metav1.ListOptions) (runtime.Object, error) { + return clientset.CoreV1().Pods(metav1.NamespaceAll).List(ctx, opts) + }) + if err := p.EachListItem(ctx, metav1.ListOptions{}, func(obj runtime.Object) error { + pod := obj.(*corev1.Pod) + if _, onTarget := nodeSet[pod.Spec.NodeName]; !onTarget { + return nil + } + if shouldSkipEviction(pod) { + return nil + } + info, err := resolveTopLevelController(ctx, clientset, pod, depCache, rsCache, stsCache) + if err != nil { + log.Printf("Warning: cannot resolve controller for pod %s/%s: %v", pod.Namespace, pod.Name, err) + return nil + } + if info == nil { + return nil + } + seen[info.controllerKey] = info.Selector + return nil + }); err != nil { + return nil, fmt.Errorf("list pods: %w", err) + } + + return lo.MapToSlice(seen, func(key controllerKey, selector *metav1.LabelSelector) controllerInfo { + return controllerInfo{controllerKey: key, Selector: selector} + }), nil +} + +// resolveTopLevelController walks a Pod's owner chain up to the workload +// controller (Deployment > ReplicaSet > Pod, StatefulSet > Pod). Returns nil +// for Pods whose top-level owner is not a stable workload — Jobs (TTL-managed), +// DaemonSets (already skipped at eviction), or bare Pods. +func resolveTopLevelController( + ctx context.Context, + clientset kubernetes.Interface, + pod *corev1.Pod, + depCache map[client.ObjectKey]*appsv1.Deployment, + rsCache map[client.ObjectKey]*appsv1.ReplicaSet, + stsCache map[client.ObjectKey]*appsv1.StatefulSet, +) (*controllerInfo, error) { + owner := metav1.GetControllerOf(pod) + if owner == nil { + return nil, nil + } + switch owner.Kind { + case "ReplicaSet": + rs, err := getCached(ctx, rsCache, pod.Namespace, owner.Name, clientset.AppsV1().ReplicaSets(pod.Namespace).Get) + if err != nil { + return nil, err + } + rsOwner := metav1.GetControllerOf(rs) + if rsOwner != nil && rsOwner.Kind == "Deployment" { + dep, err := getCached(ctx, depCache, pod.Namespace, rsOwner.Name, clientset.AppsV1().Deployments(pod.Namespace).Get) + if err != nil { + return nil, err + } + return &controllerInfo{ + controllerKey: controllerKey{Namespace: pod.Namespace, Kind: "Deployment", Name: dep.Name}, + Selector: dep.Spec.Selector, + }, nil + } + return &controllerInfo{ + controllerKey: controllerKey{Namespace: pod.Namespace, Kind: "ReplicaSet", Name: rs.Name}, + Selector: rs.Spec.Selector, + }, nil + case "StatefulSet": + sts, err := getCached(ctx, stsCache, pod.Namespace, owner.Name, clientset.AppsV1().StatefulSets(pod.Namespace).Get) + if err != nil { + return nil, err + } + return &controllerInfo{ + controllerKey: controllerKey{Namespace: pod.Namespace, Kind: "StatefulSet", Name: sts.Name}, + Selector: sts.Spec.Selector, + }, nil + default: + // DaemonSet (skipped before reaching here), Job (TTL), CronJob, + // custom controllers — none get a temporary PDB. + return nil, nil + } +} + +// getCached returns the object identified by (ns, name) from cache, fetching it +// via get — the namespace-bound typed clientset accessor, e.g. +// clientset.AppsV1().ReplicaSets(ns).Get — and populating the cache on a miss. +// T is inferred from the cache's value type, collapsing the per-kind getters +// into a single generic lookup. +func getCached[T any]( + ctx context.Context, + cache map[client.ObjectKey]*T, + ns, name string, + get func(ctx context.Context, name string, opts metav1.GetOptions) (*T, error), +) (*T, error) { + key := client.ObjectKey{Namespace: ns, Name: name} + if obj, ok := cache[key]; ok { + return obj, nil + } + obj, err := get(ctx, name, metav1.GetOptions{}) + if err != nil { + return nil, err + } + cache[key] = obj + return obj, nil +} + +// listNamespacePDBs returns every PDB in the namespace. Used to detect +// pre-existing user PDBs covering a controller we'd otherwise PDB-protect. +func listNamespacePDBs(ctx context.Context, clientset kubernetes.Interface, namespace string) ([]policyv1.PodDisruptionBudget, error) { + list, err := clientset.PolicyV1().PodDisruptionBudgets(namespace).List(ctx, metav1.ListOptions{}) + if err != nil { + return nil, err + } + return list.Items, nil +} + +// hasUserPDB returns true when an existing non-temporary PDB has the same +// selector as the controller's pod selector. This is a conservative +// equality check: a broader user PDB will NOT be detected, and we'll create +// our own. Eviction will then respect the most restrictive of the two, +// preserving the user's intent. +func hasUserPDB(existing []policyv1.PodDisruptionBudget, controllerSelector *metav1.LabelSelector) bool { + if controllerSelector == nil { + return false + } + return slices.ContainsFunc(existing, func(pdb policyv1.PodDisruptionBudget) bool { + return !isTemporaryPDB(&pdb) && reflect.DeepEqual(pdb.Spec.Selector, controllerSelector) + }) +} + +func isTemporaryPDB(pdb *policyv1.PodDisruptionBudget) bool { + return pdb.Labels[pdbManagedByLabelKey] == pdbManagedByLabelValue && + pdb.Labels[pdbTempLabelKey] == pdbTempLabelValue +} + +// createTempPDB writes (or no-ops if our PDB already exists) a temporary +// PodDisruptionBudget with maxUnavailable: 1. Existing PDBs that aren't ours +// at the same name are left alone with a logged warning — that's a name +// collision the user must resolve. +func createTempPDB(ctx context.Context, ctrlClient client.Client, c controllerInfo, dryRun bool) error { + name := tempPDBName(c.Kind, c.Name) + if dryRun { + log.Printf("[dry-run] would create PDB %s/%s (maxUnavailable: 1, selector: %s)", c.Namespace, name, formatSelector(c.Selector)) + return nil + } + + existing := &policyv1.PodDisruptionBudget{} + err := ctrlClient.Get(ctx, client.ObjectKey{Namespace: c.Namespace, Name: name}, existing) + switch { + case err == nil: + if !isTemporaryPDB(existing) { + log.Printf("Warning: PDB %s/%s exists but is not labelled as temporary; leaving it untouched", c.Namespace, name) + return nil + } + // Our PDB from a previous (possibly crashed) run. Leave as-is; the + // cleanup step will remove it at the end of the current run. + return nil + case !apierrors.IsNotFound(err): + return fmt.Errorf("failed to get PDB %s/%s: %w", c.Namespace, name, err) + } + + maxUnavailable := intstr.FromInt(1) + pdb := &policyv1.PodDisruptionBudget{ + TypeMeta: metav1.TypeMeta{APIVersion: "policy/v1", Kind: "PodDisruptionBudget"}, + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: c.Namespace, + Labels: map[string]string{ + pdbManagedByLabelKey: pdbManagedByLabelValue, + pdbTempLabelKey: pdbTempLabelValue, + }, + }, + Spec: policyv1.PodDisruptionBudgetSpec{ + Selector: c.Selector.DeepCopy(), + MaxUnavailable: &maxUnavailable, + }, + } + if err := ctrlClient.Create(ctx, pdb); err != nil { + return fmt.Errorf("failed to create PDB %s/%s: %w", c.Namespace, name, err) + } + log.Printf("Created temporary PDB %s/%s for %s/%s (maxUnavailable: 1).", c.Namespace, name, c.Kind, c.Name) + return nil +} + +// tempPDBName builds a DNS-label-safe PDB name. Long controller names are +// truncated so the final name (including the suffix) fits the 63-char limit. +func tempPDBName(kind, controllerName string) string { + prefix := strings.ToLower(kind) + "-" + controllerName + if len(prefix)+len(pdbNameSuffix) > validation.DNS1123LabelMaxLength { + prefix = prefix[:validation.DNS1123LabelMaxLength-len(pdbNameSuffix)] + } + return prefix + pdbNameSuffix +} + +func formatSelector(s *metav1.LabelSelector) string { + if s == nil { + return "" + } + parts := lo.MapToSlice(s.MatchLabels, func(k, v string) string { + return k + "=" + v + }) + return strings.Join(parts, ",") +} diff --git a/cmd/kubectl-datadog/autoscaling/cluster/evict/pdb_test.go b/cmd/kubectl-datadog/autoscaling/cluster/evict/pdb_test.go new file mode 100644 index 0000000000..c5fbd3d09a --- /dev/null +++ b/cmd/kubectl-datadog/autoscaling/cluster/evict/pdb_test.go @@ -0,0 +1,367 @@ +package evict + +import ( + "strings" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + policyv1 "k8s.io/api/policy/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/util/intstr" + "k8s.io/client-go/kubernetes/fake" + "k8s.io/client-go/kubernetes/scheme" + "k8s.io/utils/ptr" + ctrlclient "sigs.k8s.io/controller-runtime/pkg/client" + ctrlfake "sigs.k8s.io/controller-runtime/pkg/client/fake" +) + +func newCtrlScheme(t *testing.T) *runtime.Scheme { + t.Helper() + sch := runtime.NewScheme() + require.NoError(t, scheme.AddToScheme(sch)) + return sch +} + +func TestTempPDBName(t *testing.T) { + for _, tc := range []struct { + kind string + name string + expect string + }{ + {"Deployment", "my-app", "deployment-my-app-evict-legacy"}, + {"StatefulSet", "db", "statefulset-db-evict-legacy"}, + // Long name that must be truncated to fit 63 chars + {"Deployment", strings.Repeat("a", 80), "deployment-" + strings.Repeat("a", 63-len("deployment-")-len(pdbNameSuffix)) + pdbNameSuffix}, + } { + got := tempPDBName(tc.kind, tc.name) + assert.LessOrEqual(t, len(got), 63, "tempPDBName output exceeds 63 chars: %s", got) + assert.Equal(t, tc.expect, got) + } +} + +func TestIsTemporaryPDB(t *testing.T) { + for _, tc := range []struct { + name string + labels map[string]string + want bool + }{ + { + name: "both labels present", + labels: map[string]string{ + pdbManagedByLabelKey: pdbManagedByLabelValue, + pdbTempLabelKey: pdbTempLabelValue, + }, + want: true, + }, + {name: "no labels", labels: map[string]string{"app": "x"}, want: false}, + { + name: "only managed-by label, must require BOTH", + labels: map[string]string{pdbManagedByLabelKey: pdbManagedByLabelValue}, + want: false, + }, + } { + t.Run(tc.name, func(t *testing.T) { + pdb := &policyv1.PodDisruptionBudget{ObjectMeta: metav1.ObjectMeta{Labels: tc.labels}} + assert.Equal(t, tc.want, isTemporaryPDB(pdb)) + }) + } +} + +func TestHasUserPDB(t *testing.T) { + matchingSelector := &metav1.LabelSelector{MatchLabels: map[string]string{"app": "x"}} + otherSelector := &metav1.LabelSelector{MatchLabels: map[string]string{"app": "y"}} + userPDB := policyv1.PodDisruptionBudget{ + ObjectMeta: metav1.ObjectMeta{Name: "user", Namespace: "default"}, + Spec: policyv1.PodDisruptionBudgetSpec{Selector: matchingSelector}, + } + tempPDB := policyv1.PodDisruptionBudget{ + ObjectMeta: metav1.ObjectMeta{Name: "temp", Namespace: "default", Labels: map[string]string{ + pdbManagedByLabelKey: pdbManagedByLabelValue, + pdbTempLabelKey: pdbTempLabelValue, + }}, + Spec: policyv1.PodDisruptionBudgetSpec{Selector: matchingSelector}, + } + + for _, tc := range []struct { + name string + existing []policyv1.PodDisruptionBudget + controllerSelector *metav1.LabelSelector + want bool + }{ + {name: "matching user PDB", existing: []policyv1.PodDisruptionBudget{userPDB}, controllerSelector: matchingSelector, want: true}, + {name: "temp PDB ignored", existing: []policyv1.PodDisruptionBudget{tempPDB}, controllerSelector: matchingSelector, want: false}, + {name: "nil controller selector", existing: []policyv1.PodDisruptionBudget{userPDB}, controllerSelector: nil, want: false}, + {name: "different selector", existing: []policyv1.PodDisruptionBudget{userPDB}, controllerSelector: otherSelector, want: false}, + } { + t.Run(tc.name, func(t *testing.T) { + assert.Equal(t, tc.want, hasUserPDB(tc.existing, tc.controllerSelector)) + }) + } +} + +func TestCleanupTempPDBs(t *testing.T) { + tempPDB := func() *policyv1.PodDisruptionBudget { + return &policyv1.PodDisruptionBudget{ + ObjectMeta: metav1.ObjectMeta{ + Name: "temp", Namespace: "default", + Labels: map[string]string{ + pdbManagedByLabelKey: pdbManagedByLabelValue, + pdbTempLabelKey: pdbTempLabelValue, + }, + }, + } + } + userPDB := &policyv1.PodDisruptionBudget{ + ObjectMeta: metav1.ObjectMeta{Name: "user", Namespace: "default"}, + } + otherManaged := &policyv1.PodDisruptionBudget{ + ObjectMeta: metav1.ObjectMeta{ + Name: "other", Namespace: "kube-system", + Labels: map[string]string{pdbManagedByLabelKey: "some-other-tool"}, + }, + } + + for _, tc := range []struct { + name string + // existing seeds the controller-runtime fake client. + existing []ctrlclient.Object + dryRun bool + // wantRemaining is the expected `/` set after cleanup. + wantRemaining []string + }{ + { + // Only PDBs carrying BOTH temp labels are removed; user PDBs + // and PDBs from other tools stay put. + name: "deletes only fully-labelled temp PDBs", + existing: []ctrlclient.Object{tempPDB(), userPDB, otherManaged}, + wantRemaining: []string{"default/user", "kube-system/other"}, + }, + { + name: "no temp PDBs is a no-op", + existing: []ctrlclient.Object{userPDB}, + wantRemaining: []string{"default/user"}, + }, + { + name: "dry-run does not delete", + existing: []ctrlclient.Object{tempPDB()}, + dryRun: true, + wantRemaining: []string{"default/temp"}, + }, + } { + t.Run(tc.name, func(t *testing.T) { + cli := ctrlfake.NewClientBuilder().WithScheme(newCtrlScheme(t)).WithObjects(tc.existing...).Build() + + require.NoError(t, cleanupTempPDBs(t.Context(), cli, tc.dryRun)) + + list := &policyv1.PodDisruptionBudgetList{} + require.NoError(t, cli.List(t.Context(), list)) + names := make([]string, 0, len(list.Items)) + for _, p := range list.Items { + names = append(names, p.Namespace+"/"+p.Name) + } + assert.ElementsMatch(t, tc.wantRemaining, names) + }) + } +} + +func TestCreateTempPDB(t *testing.T) { + appController := controllerInfo{ + controllerKey: controllerKey{Namespace: "default", Kind: "Deployment", Name: "app"}, + Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "x"}}, + } + existingTempPDB := func() *policyv1.PodDisruptionBudget { + return &policyv1.PodDisruptionBudget{ + ObjectMeta: metav1.ObjectMeta{ + Name: tempPDBName("Deployment", "app"), Namespace: "default", + Labels: map[string]string{ + pdbManagedByLabelKey: pdbManagedByLabelValue, + pdbTempLabelKey: pdbTempLabelValue, + }, + }, + Spec: policyv1.PodDisruptionBudgetSpec{ + MaxUnavailable: ptr.To(intstr.FromInt(1)), + Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "x"}}, + }, + } + } + userPDBAtSameName := &policyv1.PodDisruptionBudget{ + ObjectMeta: metav1.ObjectMeta{ + Name: tempPDBName("Deployment", "app"), Namespace: "default", + // no temp-pdb labels: looks like a user PDB that happens to share + // our naming. + }, + } + + // ownership describes the expected ownership of the surviving PDB. + type ownership int + const ( + expectNoPDB ownership = iota // 0 PDBs in cluster + expectOurs // 1 PDB, with our temp labels + MaxUnavailable=1 + expectUserPDB // 1 PDB, NOT carrying our labels + expectAny // 1 PDB, ownership irrelevant (idempotent case) + ) + + for _, tc := range []struct { + name string + // existing seeds the controller-runtime fake client. + existing []ctrlclient.Object + dryRun bool + // expect describes the expected post-create cluster state. + expect ownership + // runCleanupAfter, when true, runs cleanupTempPDBs after createTempPDB + // and asserts the cluster ends up empty — exercises the crash-recovery + // loop where a previous run left a temp PDB behind. + runCleanupAfter bool + }{ + { + name: "creates PDB when missing", + expect: expectOurs, + }, + { + // Crash-recovery scenario: a previous run created a temp PDB + // and was killed. A fresh ensure must NOT duplicate it, and + // cleanup must still remove it. + name: "idempotent on pre-existing temp PDB", + existing: []ctrlclient.Object{existingTempPDB()}, + expect: expectAny, + runCleanupAfter: true, + }, + { + // User-owned PDB at the same name (no temp labels) must be + // left untouched. + name: "name collision with user PDB leaves it untouched", + existing: []ctrlclient.Object{userPDBAtSameName}, + expect: expectUserPDB, + }, + { + name: "dry-run does not create", + dryRun: true, + expect: expectNoPDB, + }, + } { + t.Run(tc.name, func(t *testing.T) { + cli := ctrlfake.NewClientBuilder().WithScheme(newCtrlScheme(t)).WithObjects(tc.existing...).Build() + + require.NoError(t, createTempPDB(t.Context(), cli, appController, tc.dryRun)) + + list := &policyv1.PodDisruptionBudgetList{} + require.NoError(t, cli.List(t.Context(), list)) + switch tc.expect { + case expectNoPDB: + assert.Empty(t, list.Items) + case expectOurs: + require.Len(t, list.Items, 1) + pdb := list.Items[0] + assert.Equal(t, pdbManagedByLabelValue, pdb.Labels[pdbManagedByLabelKey]) + assert.Equal(t, pdbTempLabelValue, pdb.Labels[pdbTempLabelKey]) + require.NotNil(t, pdb.Spec.MaxUnavailable) + assert.Equal(t, int32(1), pdb.Spec.MaxUnavailable.IntVal) + case expectUserPDB: + require.Len(t, list.Items, 1) + assert.NotEqual(t, pdbManagedByLabelValue, list.Items[0].Labels[pdbManagedByLabelKey]) + case expectAny: + require.Len(t, list.Items, 1) + } + + if tc.runCleanupAfter { + require.NoError(t, cleanupTempPDBs(t.Context(), cli, false)) + require.NoError(t, cli.List(t.Context(), list)) + assert.Empty(t, list.Items, "cleanup must remove pre-existing temp PDB even with no state from ensure") + } + }) + } +} + +// TestDiscoverControllers_FiltersByNodeSet locks in the cluster-wide-list + +// client-side filter shape of discoverControllers: pods on target nodes +// resolve to their top-level controllers, pods elsewhere are ignored, and +// duplicate controllers (multiple pods of one Deployment) collapse to a +// single entry. +func TestDiscoverControllers_FiltersByNodeSet(t *testing.T) { + mkDeployment := func(name, ns, appLabel string) *appsv1.Deployment { + return &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: ns}, + Spec: appsv1.DeploymentSpec{ + Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": appLabel}}, + }, + } + } + mkReplicaSet := func(name, ns, owningDeployment string) *appsv1.ReplicaSet { + return &appsv1.ReplicaSet{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, Namespace: ns, + OwnerReferences: []metav1.OwnerReference{{Kind: "Deployment", Name: owningDeployment, Controller: ptr.To(true)}}, + }, + } + } + // mkBareReplicaSet is a ReplicaSet with no owning Deployment — its pods + // resolve to the ReplicaSet itself as the top-level controller. + mkBareReplicaSet := func(name, ns, appLabel string) *appsv1.ReplicaSet { + return &appsv1.ReplicaSet{ + ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: ns}, + Spec: appsv1.ReplicaSetSpec{ + Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": appLabel}}, + }, + } + } + mkStatefulSet := func(name, ns, appLabel string) *appsv1.StatefulSet { + return &appsv1.StatefulSet{ + ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: ns}, + Spec: appsv1.StatefulSetSpec{ + Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": appLabel}}, + }, + } + } + mkPod := func(name, ns, nodeName, ownerKind, ownerName string) *corev1.Pod { + return &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, Namespace: ns, + OwnerReferences: []metav1.OwnerReference{{Kind: ownerKind, Name: ownerName, Controller: ptr.To(true)}}, + }, + Spec: corev1.PodSpec{NodeName: nodeName}, + } + } + + client := fake.NewClientset( + mkDeployment("target-app", "default", "target"), + mkReplicaSet("target-app-abc", "default", "target-app"), + // two pods of the same Deployment, both on the target node — must dedup. + mkPod("target-pod-1", "default", "target-node", "ReplicaSet", "target-app-abc"), + mkPod("target-pod-2", "default", "target-node", "ReplicaSet", "target-app-abc"), + // StatefulSet pod on the target node — resolves to the StatefulSet. + mkStatefulSet("target-sts", "default", "target-sts"), + mkPod("target-sts-0", "default", "target-node", "StatefulSet", "target-sts"), + // bare ReplicaSet (no owning Deployment) on the target node — resolves + // to the ReplicaSet itself. + mkBareReplicaSet("target-bare-rs", "default", "target-bare"), + mkPod("target-bare-rs-xyz", "default", "target-node", "ReplicaSet", "target-bare-rs"), + // DaemonSet-owned pod on the target node — the default switch arm + // returns no controller (DaemonSets get no temporary PDB). + mkPod("target-ds-pod", "default", "target-node", "DaemonSet", "target-ds"), + // orphan pod (no controller owner) on the target node — ignored. + &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "target-orphan", Namespace: "default"}, + Spec: corev1.PodSpec{NodeName: "target-node"}, + }, + // pod on a non-target node, must be ignored. + mkDeployment("off-target-app", "default", "off"), + mkReplicaSet("off-target-app-def", "default", "off-target-app"), + mkPod("off-target-pod", "default", "other-node", "ReplicaSet", "off-target-app-def"), + ) + + controllers, err := discoverControllers(t.Context(), client, map[string]struct{}{"target-node": {}}) + require.NoError(t, err) + got := make(map[string]string, len(controllers)) // name -> kind + for _, c := range controllers { + got[c.Name] = c.Kind + } + assert.Equal(t, map[string]string{ + "target-app": "Deployment", + "target-sts": "StatefulSet", + "target-bare-rs": "ReplicaSet", + }, got) +} diff --git a/cmd/kubectl-datadog/autoscaling/cluster/evict/plan.go b/cmd/kubectl-datadog/autoscaling/cluster/evict/plan.go new file mode 100644 index 0000000000..5c5e7418ce --- /dev/null +++ b/cmd/kubectl-datadog/autoscaling/cluster/evict/plan.go @@ -0,0 +1,157 @@ +// Package evict implements the `kubectl datadog autoscaling cluster +// evict-legacy-nodes` subcommand. It drains workloads from nodes that are not +// managed by the Datadog Karpenter NodePools (cluster-autoscaler ASGs, EKS +// managed node groups, user-created Karpenter NodePools, standalone EC2 +// instances) and scales those node groups to zero so that future capacity is +// provisioned exclusively by the NodePools created by +// `kubectl datadog autoscaling cluster install`. +package evict + +import ( + "errors" + "fmt" + "maps" + "slices" + "strings" + + "github.com/samber/lo" + + "github.com/DataDog/datadog-operator/cmd/kubectl-datadog/autoscaling/cluster/common/clusterinfo" +) + +// Target identifies a single node-management entity to evict. The Entity field +// is the bucket name within `ClusterInfo.NodeManagement[Manager]` — an ASG +// name, an EKS managed node group name, a Karpenter NodePool name, or the +// empty string for the standalone bucket. +type Target struct { + Manager clusterinfo.NodeManager + Entity string + Nodes []string +} + +// ParseTargetSpec parses a CLI `--target` value into a Target with an empty +// Nodes slice (Nodes are filled in by BuildPlan from the cluster snapshot). +// +// Format: `/`, with the entity omitted for the standalone +// manager. Fargate and unknown are rejected here — their migration is out of +// scope of this command. +func ParseTargetSpec(raw string) (Target, error) { + raw = strings.TrimSpace(raw) + if raw == "" { + return Target{}, errors.New("--target value cannot be empty") + } + mgrStr, entity, hasSlash := strings.Cut(raw, "/") + mgr := clusterinfo.NodeManager(mgrStr) + switch mgr { + case clusterinfo.NodeManagerASG, + clusterinfo.NodeManagerEKSManagedNodeGroup, + clusterinfo.NodeManagerKarpenter: + if !hasSlash || entity == "" { + return Target{}, fmt.Errorf("--target=%s requires a name: %s/", mgrStr, mgrStr) + } + return Target{Manager: mgr, Entity: entity}, nil + case clusterinfo.NodeManagerStandalone: + if hasSlash && entity != "" { + return Target{}, fmt.Errorf("--target=standalone does not take a name (got %q)", entity) + } + return Target{Manager: mgr, Entity: ""}, nil + case clusterinfo.NodeManagerFargate: + return Target{}, errors.New("--target=fargate is not supported: manage Fargate profiles directly via AWS") + case clusterinfo.NodeManagerUnknown: + return Target{}, errors.New("--target=unknown is not supported: nodes with unknown providers cannot be migrated automatically") + default: + return Target{}, fmt.Errorf("--target=%s: unknown manager type (supported: asg, eksManagedNodeGroup, karpenter, standalone)", mgrStr) + } +} + +// BuildPlan turns user intent (--all or one or more --target flags) into the +// list of Targets the orchestrator will evict. Entries flagged ManagedByDatadog +// are filtered out (protected by definition); fargate and unknown managers are +// skipped in --all mode (their lifecycle is handled elsewhere or unknown). +// +// In targeted mode, user input errors are collected and returned together so +// the user can fix all mistakes in a single iteration. +func BuildPlan(info *clusterinfo.ClusterInfo, all bool, specs []Target) ([]Target, error) { + if info == nil { + return nil, errors.New("cluster info is nil") + } + if all { + return buildAllPlan(info), nil + } + return buildTargetedPlan(info, specs) +} + +func buildAllPlan(info *clusterinfo.ClusterInfo) []Target { + var targets []Target + for _, mgr := range []clusterinfo.NodeManager{ + clusterinfo.NodeManagerKarpenter, + clusterinfo.NodeManagerEKSManagedNodeGroup, + clusterinfo.NodeManagerASG, + clusterinfo.NodeManagerStandalone, + } { + bucket, ok := info.NodeManagement[mgr] + if !ok { + continue + } + targets = append(targets, lo.FilterMap(slices.Sorted(maps.Keys(bucket)), func(name string, _ int) (Target, bool) { + entry := bucket[name] + if entry.ManagedByDatadog { + return Target{}, false + } + return Target{ + Manager: mgr, + Entity: name, + Nodes: entry.Nodes, + }, true + })...) + } + return targets +} + +func buildTargetedPlan(info *clusterinfo.ClusterInfo, specs []Target) ([]Target, error) { + if len(specs) == 0 { + return nil, errors.New("at least one --target must be provided, or --all") + } + var ( + targets []Target + errs []error + ) + for _, t := range specs { + bucket, ok := info.NodeManagement[t.Manager] + if !ok { + errs = append(errs, fmt.Errorf("--target=%s/%s: no %s entities found in the cluster snapshot", t.Manager, t.Entity, t.Manager)) + continue + } + entry, ok := bucket[t.Entity] + if !ok { + errs = append(errs, fmt.Errorf("--target=%s/%s: entity not found in the cluster snapshot", t.Manager, t.Entity)) + continue + } + if entry.ManagedByDatadog { + errs = append(errs, fmt.Errorf("--target=%s/%s: this entity is managed by Datadog and cannot be evicted", t.Manager, t.Entity)) + continue + } + targets = append(targets, Target{ + Manager: t.Manager, + Entity: t.Entity, + Nodes: entry.Nodes, + }) + } + if len(errs) > 0 { + return nil, errors.Join(errs...) + } + return targets, nil +} + +// hasDatadogManagedNodePool reports whether the snapshot lists at least one +// Karpenter NodePool flagged as Datadog-managed. The orchestrator uses this as +// a pre-flight guard: if no destination NodePool exists, scaling legacy +// capacity to zero would leave the cluster with no working capacity. +func hasDatadogManagedNodePool(info *clusterinfo.ClusterInfo) bool { + for _, entry := range info.NodeManagement[clusterinfo.NodeManagerKarpenter] { + if entry.ManagedByDatadog { + return true + } + } + return false +} diff --git a/cmd/kubectl-datadog/autoscaling/cluster/evict/plan_test.go b/cmd/kubectl-datadog/autoscaling/cluster/evict/plan_test.go new file mode 100644 index 0000000000..8fe110258e --- /dev/null +++ b/cmd/kubectl-datadog/autoscaling/cluster/evict/plan_test.go @@ -0,0 +1,218 @@ +package evict + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/DataDog/datadog-operator/cmd/kubectl-datadog/autoscaling/cluster/common/clusterinfo" +) + +func TestParseTargetSpec(t *testing.T) { + for _, tc := range []struct { + name string + input string + want Target + wantError string + }{ + {name: "asg", input: "asg/legacy-asg", want: Target{Manager: clusterinfo.NodeManagerASG, Entity: "legacy-asg"}}, + {name: "eks-mng", input: "eksManagedNodeGroup/standard-2", want: Target{Manager: clusterinfo.NodeManagerEKSManagedNodeGroup, Entity: "standard-2"}}, + {name: "karpenter", input: "karpenter/user-pool", want: Target{Manager: clusterinfo.NodeManagerKarpenter, Entity: "user-pool"}}, + {name: "standalone-no-slash", input: "standalone", want: Target{Manager: clusterinfo.NodeManagerStandalone, Entity: ""}}, + {name: "standalone-empty-after-slash", input: "standalone/", want: Target{Manager: clusterinfo.NodeManagerStandalone, Entity: ""}}, + {name: "leading-trailing-spaces", input: " asg/foo ", want: Target{Manager: clusterinfo.NodeManagerASG, Entity: "foo"}}, + + {name: "empty", input: "", wantError: "cannot be empty"}, + {name: "asg-no-name", input: "asg", wantError: "requires a name"}, + {name: "asg-empty-name", input: "asg/", wantError: "requires a name"}, + {name: "standalone-with-name", input: "standalone/foo", wantError: "does not take a name"}, + {name: "fargate-rejected", input: "fargate/profile", wantError: "not supported"}, + {name: "unknown-rejected", input: "unknown/x", wantError: "not supported"}, + {name: "garbage", input: "blob/x", wantError: "unknown manager"}, + } { + t.Run(tc.name, func(t *testing.T) { + got, err := ParseTargetSpec(tc.input) + if tc.wantError != "" { + require.Error(t, err) + assert.Contains(t, err.Error(), tc.wantError) + return + } + require.NoError(t, err) + assert.Equal(t, tc.want, got) + }) + } +} + +func TestBuildPlan(t *testing.T) { + mixedInfo := &clusterinfo.ClusterInfo{ + NodeManagement: map[clusterinfo.NodeManager]map[string]clusterinfo.NodeManagerEntry{ + clusterinfo.NodeManagerASG: { + "legacy-asg-a": {Nodes: []string{"ip-1", "ip-2"}, ManagedByDatadog: false}, + "legacy-asg-b": {Nodes: []string{"ip-3"}, ManagedByDatadog: false}, + }, + clusterinfo.NodeManagerEKSManagedNodeGroup: { + "mng-1": {Nodes: []string{"ip-4"}, ManagedByDatadog: false}, + "datadog-owned": {Nodes: []string{"ip-5"}, ManagedByDatadog: true}, + }, + clusterinfo.NodeManagerKarpenter: { + "user-np": {Nodes: []string{"ip-6"}, ManagedByDatadog: false}, + "dd-np": {Nodes: []string{"ip-7"}, ManagedByDatadog: true}, + }, + clusterinfo.NodeManagerStandalone: { + "": {Nodes: []string{"ip-8"}, ManagedByDatadog: false}, + }, + clusterinfo.NodeManagerFargate: { + "profile-1": {Nodes: []string{"fargate-ip-9"}, ManagedByDatadog: false}, + }, + clusterinfo.NodeManagerUnknown: { + "": {Nodes: []string{"weird-1"}, ManagedByDatadog: false}, + }, + }, + } + asgOnlyInfo := &clusterinfo.ClusterInfo{ + NodeManagement: map[clusterinfo.NodeManager]map[string]clusterinfo.NodeManagerEntry{ + clusterinfo.NodeManagerASG: { + "my-asg": {Nodes: []string{"ip-1", "ip-2"}, ManagedByDatadog: false}, + }, + }, + } + ddASGInfo := &clusterinfo.ClusterInfo{ + NodeManagement: map[clusterinfo.NodeManager]map[string]clusterinfo.NodeManagerEntry{ + clusterinfo.NodeManagerASG: { + "dd-asg": {Nodes: []string{"ip-1"}, ManagedByDatadog: true}, + }, + }, + } + + for _, tc := range []struct { + name string + info *clusterinfo.ClusterInfo + all bool + targets []Target + + wantErr bool + wantErrContains string + // wantEntities, when non-nil, is the expected set of `/` + // strings in the returned plan. ElementsMatch is used (order + // irrelevant). nil disables the check. + wantEntities []string + // wantNodes maps entity name to its expected Nodes slice in the + // returned plan (only used in the targeted happy-path case). + wantNodes map[string][]string + }{ + { + // --all: Datadog-managed entries dropped; fargate + unknown + // skipped entirely. + name: "--all returns every non-Datadog entity", + info: mixedInfo, + all: true, + wantEntities: []string{ + "karpenter/user-np", + "eksManagedNodeGroup/mng-1", + "asg/legacy-asg-a", + "asg/legacy-asg-b", + "standalone/", + }, + }, + { + name: "targeted happy path", + info: asgOnlyInfo, + targets: []Target{{Manager: clusterinfo.NodeManagerASG, Entity: "my-asg"}}, + wantEntities: []string{"asg/my-asg"}, + wantNodes: map[string][]string{"my-asg": {"ip-1", "ip-2"}}, + }, + { + name: "missing entity errors", + info: ddASGInfo, + targets: []Target{{Manager: clusterinfo.NodeManagerASG, Entity: "ghost"}}, + wantErr: true, + wantErrContains: "not found", + }, + { + name: "datadog-managed entity is refused", + info: ddASGInfo, + targets: []Target{{Manager: clusterinfo.NodeManagerASG, Entity: "dd-asg"}}, + wantErr: true, + wantErrContains: "managed by Datadog", + }, + { + name: "missing manager bucket errors", + info: ddASGInfo, + targets: []Target{{Manager: clusterinfo.NodeManagerKarpenter, Entity: "x"}}, + wantErr: true, + wantErrContains: "no karpenter entities", + }, + { + name: "no targets and not --all errors", + info: ddASGInfo, + wantErr: true, + wantErrContains: "at least one --target", + }, + } { + t.Run(tc.name, func(t *testing.T) { + got, err := BuildPlan(tc.info, tc.all, tc.targets) + if tc.wantErr { + require.Error(t, err) + if tc.wantErrContains != "" { + assert.Contains(t, err.Error(), tc.wantErrContains) + } + return + } + require.NoError(t, err) + if tc.wantEntities != nil { + entities := make([]string, 0, len(got)) + for _, g := range got { + entities = append(entities, string(g.Manager)+"/"+g.Entity) + } + assert.ElementsMatch(t, tc.wantEntities, entities) + } + for entity, wantNodes := range tc.wantNodes { + found := false + for _, g := range got { + if g.Entity == entity { + assert.Equal(t, wantNodes, g.Nodes) + found = true + break + } + } + assert.True(t, found, "expected entity %q in plan", entity) + } + }) + } +} + +func TestHasDatadogManagedNodePool(t *testing.T) { + for _, tc := range []struct { + name string + info *clusterinfo.ClusterInfo + want bool + }{ + { + name: "no karpenter bucket", + info: &clusterinfo.ClusterInfo{NodeManagement: map[clusterinfo.NodeManager]map[string]clusterinfo.NodeManagerEntry{}}, + want: false, + }, + { + name: "only user NodePools", + info: &clusterinfo.ClusterInfo{NodeManagement: map[clusterinfo.NodeManager]map[string]clusterinfo.NodeManagerEntry{ + clusterinfo.NodeManagerKarpenter: {"user-np": {ManagedByDatadog: false}}, + }}, + want: false, + }, + { + name: "at least one Datadog NodePool", + info: &clusterinfo.ClusterInfo{NodeManagement: map[clusterinfo.NodeManager]map[string]clusterinfo.NodeManagerEntry{ + clusterinfo.NodeManagerKarpenter: { + "user-np": {ManagedByDatadog: false}, + "dd-np": {ManagedByDatadog: true}, + }, + }}, + want: true, + }, + } { + t.Run(tc.name, func(t *testing.T) { + assert.Equal(t, tc.want, hasDatadogManagedNodePool(tc.info)) + }) + } +} diff --git a/cmd/kubectl-datadog/autoscaling/cluster/evict/preflight.go b/cmd/kubectl-datadog/autoscaling/cluster/evict/preflight.go new file mode 100644 index 0000000000..1ae27ec17c --- /dev/null +++ b/cmd/kubectl-datadog/autoscaling/cluster/evict/preflight.go @@ -0,0 +1,79 @@ +package evict + +import ( + "context" + "fmt" + "log" + + "github.com/fatih/color" + "github.com/samber/lo" + "k8s.io/apimachinery/pkg/api/meta" + "k8s.io/cli-runtime/pkg/genericclioptions" + "sigs.k8s.io/controller-runtime/pkg/client" + karpv1 "sigs.k8s.io/karpenter/pkg/apis/v1" + + "github.com/DataDog/datadog-operator/cmd/kubectl-datadog/autoscaling/cluster/common/clusterinfo" +) + +// ddNodePoolCreatedLabel is the label every Datadog autoscaling product (this +// CLI and the cluster agent) puts on the Karpenter NodePools it creates. Used +// here to identify "Datadog-side" NodePools when comparing weights against +// user-side ones. +const ddNodePoolCreatedLabel = "autoscaling.datadoghq.com/created" + +// runPreflightWarnings prints non-blocking warnings for situations that don't +// prevent eviction but may surprise the operator after the fact. Pure +// best-effort — failures here are logged but do not abort the run. +func runPreflightWarnings(ctx context.Context, streams genericclioptions.IOStreams, ctrlClient client.Client, targets []Target) { + warnKarpenterWeightConflicts(ctx, streams, ctrlClient, targets) +} + +// warnKarpenterWeightConflicts compares the spec.weight of each user-managed +// Karpenter NodePool we are about to drain against the max spec.weight among +// Datadog-managed NodePools. When a user NodePool has a weight >= the Datadog +// max, freshly evicted pods may be re-scheduled onto a new node from the +// same user NodePool, defeating the migration. +func warnKarpenterWeightConflicts(ctx context.Context, streams genericclioptions.IOStreams, ctrlClient client.Client, targets []Target) { + userNPNames := lo.FilterMap(targets, func(t Target, _ int) (string, bool) { + return t.Entity, t.Manager == clusterinfo.NodeManagerKarpenter + }) + if len(userNPNames) == 0 { + return + } + + list := &karpv1.NodePoolList{} + if err := ctrlClient.List(ctx, list); err != nil { + if meta.IsNoMatchError(err) { + return + } + log.Printf("Warning: failed to list NodePools for pre-flight weight check: %v", err) + return + } + + var ddMaxWeight int32 + weightByName := make(map[string]int32, len(list.Items)) + for i := range list.Items { + np := &list.Items[i] + var w int32 + if np.Spec.Weight != nil { + w = *np.Spec.Weight + } + weightByName[np.Name] = w + if np.Labels[ddNodePoolCreatedLabel] == "true" && w > ddMaxWeight { + ddMaxWeight = w + } + } + + for _, name := range userNPNames { + w, ok := weightByName[name] + if !ok { + continue + } + if w >= ddMaxWeight { + fmt.Fprintln(streams.ErrOut, color.YellowString( + "⚠ NodePool %q has spec.weight=%d ≥ Datadog NodePool max weight=%d; evicted pods may be re-scheduled onto a freshly provisioned node from the same user NodePool. Consider raising the Datadog NodePool weight before retrying.", + name, w, ddMaxWeight, + )) + } + } +} diff --git a/cmd/kubectl-datadog/autoscaling/cluster/evict/preflight_test.go b/cmd/kubectl-datadog/autoscaling/cluster/evict/preflight_test.go new file mode 100644 index 0000000000..6d814179d1 --- /dev/null +++ b/cmd/kubectl-datadog/autoscaling/cluster/evict/preflight_test.go @@ -0,0 +1,130 @@ +package evict + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/cli-runtime/pkg/genericclioptions" + "k8s.io/client-go/kubernetes/scheme" + "k8s.io/utils/ptr" + ctrlclient "sigs.k8s.io/controller-runtime/pkg/client" + ctrlfake "sigs.k8s.io/controller-runtime/pkg/client/fake" + karpv1 "sigs.k8s.io/karpenter/pkg/apis/v1" + + "github.com/DataDog/datadog-operator/cmd/kubectl-datadog/autoscaling/cluster/common/clusterinfo" +) + +func newKarpenterScheme(t *testing.T) *runtime.Scheme { + t.Helper() + sch := runtime.NewScheme() + require.NoError(t, scheme.AddToScheme(sch)) + gv := schema.GroupVersion{Group: "karpenter.sh", Version: "v1"} + sch.AddKnownTypes(gv, &karpv1.NodePool{}, &karpv1.NodePoolList{}) + metav1.AddToGroupVersion(sch, gv) + return sch +} + +func mkNodePool(name string, weight *int32, datadogManaged bool) *karpv1.NodePool { + labels := map[string]string{} + if datadogManaged { + labels[ddNodePoolCreatedLabel] = "true" + } + return &karpv1.NodePool{ + ObjectMeta: metav1.ObjectMeta{Name: name, Labels: labels}, + Spec: karpv1.NodePoolSpec{Weight: weight}, + } +} + +func TestWarnKarpenterWeightConflicts(t *testing.T) { + for _, tc := range []struct { + name string + nodePools []ctrlclient.Object + targets []Target + // wantContains lists substrings that MUST appear in the stderr output. + wantContains []string + // wantWarnEmpty asserts the stderr output is empty (no warning fired). + wantWarnEmpty bool + }{ + { + // No Karpenter target ⇒ nothing to warn about. + name: "no karpenter targets ⇒ no warning", + targets: []Target{ + {Manager: clusterinfo.NodeManagerASG, Entity: "asg-1"}, + {Manager: clusterinfo.NodeManagerStandalone, Entity: ""}, + }, + wantWarnEmpty: true, + }, + { + // User NP weight < Datadog NP weight ⇒ no conflict, no warning. + name: "user weight below Datadog weight", + nodePools: []ctrlclient.Object{ + mkNodePool("dd-np", ptr.To(int32(100)), true), + mkNodePool("user-np", ptr.To(int32(50)), false), + }, + targets: []Target{{Manager: clusterinfo.NodeManagerKarpenter, Entity: "user-np"}}, + wantWarnEmpty: true, + }, + { + // User NP weight > Datadog NP weight ⇒ warns (`>=` check). + name: "user weight above Datadog weight warns", + nodePools: []ctrlclient.Object{ + mkNodePool("dd-np", ptr.To(int32(10)), true), + mkNodePool("user-np", ptr.To(int32(50)), false), + }, + targets: []Target{{Manager: clusterinfo.NodeManagerKarpenter, Entity: "user-np"}}, + wantContains: []string{"user-np", "weight=50"}, + }, + { + // Both nil ⇒ both default to 0 ⇒ equal-weight conflict warns. + name: "nil weights both default to 0 (equal-weight conflict)", + nodePools: []ctrlclient.Object{ + mkNodePool("dd-np", nil, true), + mkNodePool("user-np", nil, false), + }, + targets: []Target{{Manager: clusterinfo.NodeManagerKarpenter, Entity: "user-np"}}, + wantContains: []string{"user-np"}, + }, + { + // Target name doesn't match any NodePool in the cluster ⇒ + // nothing to compare against, no panic. + name: "target not in cluster is ignored", + nodePools: []ctrlclient.Object{ + mkNodePool("dd-np", ptr.To(int32(100)), true), + }, + targets: []Target{{Manager: clusterinfo.NodeManagerKarpenter, Entity: "ghost-np"}}, + wantWarnEmpty: true, + }, + } { + t.Run(tc.name, func(t *testing.T) { + cli := ctrlfake.NewClientBuilder(). + WithScheme(newKarpenterScheme(t)). + WithObjects(tc.nodePools...). + Build() + streams, _, _, errBuf := genericclioptions.NewTestIOStreams() + + warnKarpenterWeightConflicts(t.Context(), streams, cli, tc.targets) + + out := errBuf.String() + if tc.wantWarnEmpty { + assert.Empty(t, out) + } + for _, s := range tc.wantContains { + assert.Contains(t, out, s) + } + }) + } +} + +// TestRunPreflightWarnings_NoOp keeps the thin `runPreflightWarnings` wrapper +// exercised separately so coverage doesn't regress as new pre-flight checks +// are added. +func TestRunPreflightWarnings_NoOp(t *testing.T) { + cli := ctrlfake.NewClientBuilder().WithScheme(newKarpenterScheme(t)).Build() + streams, _, _, errBuf := genericclioptions.NewTestIOStreams() + runPreflightWarnings(t.Context(), streams, cli, nil) + assert.Empty(t, errBuf.String()) +} diff --git a/cmd/kubectl-datadog/autoscaling/cluster/evict/prompt.go b/cmd/kubectl-datadog/autoscaling/cluster/evict/prompt.go new file mode 100644 index 0000000000..f467f3ad60 --- /dev/null +++ b/cmd/kubectl-datadog/autoscaling/cluster/evict/prompt.go @@ -0,0 +1,65 @@ +package evict + +import ( + "fmt" + "maps" + "slices" + "strings" + + "github.com/fatih/color" + "github.com/samber/lo" + "k8s.io/cli-runtime/pkg/genericclioptions" + + "github.com/DataDog/datadog-operator/cmd/kubectl-datadog/autoscaling/cluster/common/clusterinfo" +) + +// printPlan writes a human-readable description of the upcoming actions — +// cluster-autoscaler scale-down, PDB creation, per-target evictions — to +// streams.Out. Used both at confirmation time and in --dry-run mode. +func printPlan(streams genericclioptions.IOStreams, info *clusterinfo.ClusterInfo, targets []Target, scaleCA, ensurePDBs bool) { + fmt.Fprintln(streams.Out, "\nThe following actions will be performed:") + if scaleCA && info != nil && info.Autoscaling.ClusterAutoscaler.Present { + fmt.Fprintf(streams.Out, " • Scale cluster-autoscaler Deployment %s/%s to 0 replicas\n", + info.Autoscaling.ClusterAutoscaler.Namespace, info.Autoscaling.ClusterAutoscaler.Name) + } + if ensurePDBs { + fmt.Fprintln(streams.Out, " • Create temporary PodDisruptionBudgets (maxUnavailable: 1) for workloads without one") + } + + grouped := lo.GroupBy(targets, func(t Target) clusterinfo.NodeManager { return t.Manager }) + // Sort the manager keys for stable, deterministic output. + for _, mgr := range slices.Sorted(maps.Keys(grouped)) { + ts := grouped[mgr] + fmt.Fprintf(streams.Out, " • Evict %s entities:\n", mgr) + for _, t := range ts { + label := t.Entity + if label == "" { + label = "(all)" + } + fmt.Fprintf(streams.Out, " ◦ %s (%d node(s))\n", label, len(t.Nodes)) + } + } + if ensurePDBs { + fmt.Fprintln(streams.Out, " • Remove the temporary PodDisruptionBudgets created above") + } + fmt.Fprintln(streams.Out, color.YellowString("\n⚠ This will drain workloads and terminate the underlying instances of non-Datadog node groups.")) + fmt.Fprintln(streams.Out, color.YellowString(" Verify the Datadog Karpenter NodePool has enough capacity headroom for the migrated pods.")) +} + +// promptConfirmation reads a y/N answer from streams.In after the plan has +// been printed. Returns true when the user confirmed (typed `y` or `yes`). +// Read errors or any other input mean "declined" — unattended runs should +// pass --yes to skip the prompt entirely. +func promptConfirmation(streams genericclioptions.IOStreams) bool { + fmt.Fprint(streams.Out, "\nContinue? (y/N): ") + var response string + // Fscanln may return "unexpected newline" when the user just presses + // Enter; that's treated as decline below. + _, _ = fmt.Fscanln(streams.In, &response) + response = strings.ToLower(strings.TrimSpace(response)) + confirmed := response == "y" || response == "yes" + if !confirmed { + fmt.Fprintln(streams.Out, "Eviction cancelled.") + } + return confirmed +} diff --git a/cmd/kubectl-datadog/autoscaling/cluster/evict/prompt_test.go b/cmd/kubectl-datadog/autoscaling/cluster/evict/prompt_test.go new file mode 100644 index 0000000000..8e45fb346d --- /dev/null +++ b/cmd/kubectl-datadog/autoscaling/cluster/evict/prompt_test.go @@ -0,0 +1,135 @@ +package evict + +import ( + "strings" + "testing" + + "github.com/stretchr/testify/assert" + "k8s.io/cli-runtime/pkg/genericclioptions" + + "github.com/DataDog/datadog-operator/cmd/kubectl-datadog/autoscaling/cluster/common/clusterinfo" +) + +func TestPrintPlan(t *testing.T) { + caPresent := clusterinfo.Autoscaling{ + ClusterAutoscaler: clusterinfo.ClusterAutoscaler{Present: true, Namespace: "kube-system", Name: "cluster-autoscaler"}, + } + caAbsent := clusterinfo.Autoscaling{ClusterAutoscaler: clusterinfo.ClusterAutoscaler{Present: false}} + fullTargets := []Target{ + {Manager: clusterinfo.NodeManagerASG, Entity: "legacy-asg", Nodes: []string{"ip-1", "ip-2"}}, + {Manager: clusterinfo.NodeManagerKarpenter, Entity: "user-np", Nodes: []string{"ip-3"}}, + {Manager: clusterinfo.NodeManagerStandalone, Entity: "", Nodes: []string{"ip-4"}}, + } + + for _, tc := range []struct { + name string + // info, when non-nil, becomes the input. Use a sentinel "nil" via + // nilInfo=true to test the nil-tolerant path. + info *clusterinfo.ClusterInfo + nilInfo bool + targets []Target + scaleCA bool + ensurePDBs bool + + wantContains []string + wantNotContains []string + }{ + { + name: "all sections rendered", + info: &clusterinfo.ClusterInfo{Autoscaling: caPresent}, + targets: fullTargets, + scaleCA: true, + ensurePDBs: true, + wantContains: []string{ + "cluster-autoscaler Deployment kube-system/cluster-autoscaler", + "Create temporary PodDisruptionBudgets", + "Remove the temporary PodDisruptionBudgets", + "Evict asg entities", + "legacy-asg (2 node(s))", + "Evict karpenter entities", + "user-np (1 node(s))", + "Evict standalone entities", + // Standalone entry with empty name renders as "(all)". + "(all) (1 node(s))", + }, + }, + { + // scaleCA=false hides the CA bullet even when CA is Present; + // ensurePDBs=false hides the PDB bullets. + name: "scaleCA off and ensurePDBs off hide their bullets", + info: &clusterinfo.ClusterInfo{Autoscaling: caPresent}, + targets: []Target{{Manager: clusterinfo.NodeManagerASG, Entity: "x"}}, + scaleCA: false, + ensurePDBs: false, + wantNotContains: []string{"Scale cluster-autoscaler", "PodDisruptionBudgets"}, + }, + { + // CA not Present ⇒ no bullet regardless of scaleCA. + name: "CA absent hides the bullet", + info: &clusterinfo.ClusterInfo{Autoscaling: caAbsent}, + targets: nil, + scaleCA: true, + ensurePDBs: false, + wantNotContains: []string{"Scale cluster-autoscaler"}, + }, + { + // Nil info ⇒ no CA bullet but the PDB bullets and warning + // footer still appear. + name: "nil info tolerated", + nilInfo: true, + targets: nil, + scaleCA: true, + ensurePDBs: true, + wantContains: []string{"Create temporary PodDisruptionBudgets"}, + wantNotContains: []string{"Scale cluster-autoscaler"}, + }, + } { + t.Run(tc.name, func(t *testing.T) { + streams, _, outBuf, _ := genericclioptions.NewTestIOStreams() + info := tc.info + if tc.nilInfo { + info = nil + } + printPlan(streams, info, tc.targets, tc.scaleCA, tc.ensurePDBs) + out := outBuf.String() + for _, s := range tc.wantContains { + assert.Contains(t, out, s) + } + for _, s := range tc.wantNotContains { + assert.NotContains(t, out, s) + } + }) + } +} + +func TestPromptConfirmation(t *testing.T) { + for _, tc := range []struct { + name string + input string + wantOK bool + wantCancelMessage bool + }{ + {name: "y", input: "y", wantOK: true}, + {name: "Y", input: "Y", wantOK: true}, + {name: "yes", input: "yes", wantOK: true}, + {name: "YES", input: "YES", wantOK: true}, + {name: "Yes-with-newline", input: "Yes\n", wantOK: true}, + + {name: "n", input: "n", wantOK: false, wantCancelMessage: true}, + {name: "N", input: "N", wantOK: false, wantCancelMessage: true}, + {name: "no", input: "no", wantOK: false, wantCancelMessage: true}, + {name: "empty", input: "", wantOK: false, wantCancelMessage: true}, + {name: "garbage", input: "blah", wantOK: false, wantCancelMessage: true}, + {name: "digit", input: "1", wantOK: false, wantCancelMessage: true}, + } { + t.Run(tc.name, func(t *testing.T) { + streams, stdin, outBuf, _ := genericclioptions.NewTestIOStreams() + stdin.WriteString(tc.input + "\n") + assert.Equal(t, tc.wantOK, promptConfirmation(streams)) + if tc.wantCancelMessage { + out := outBuf.String() + assert.True(t, strings.Contains(out, "cancelled") || strings.Contains(out, "Cancelled")) + } + }) + } +} diff --git a/cmd/kubectl-datadog/autoscaling/cluster/evict/run.go b/cmd/kubectl-datadog/autoscaling/cluster/evict/run.go new file mode 100644 index 0000000000..f18d6d3fb4 --- /dev/null +++ b/cmd/kubectl-datadog/autoscaling/cluster/evict/run.go @@ -0,0 +1,275 @@ +package evict + +import ( + "context" + "errors" + "fmt" + "log" + "time" + + "k8s.io/cli-runtime/pkg/genericclioptions" + "k8s.io/client-go/kubernetes" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/log/zap" + + "github.com/DataDog/datadog-operator/cmd/kubectl-datadog/autoscaling/cluster/common/clients" + "github.com/DataDog/datadog-operator/cmd/kubectl-datadog/autoscaling/cluster/common/clusterinfo" + "github.com/DataDog/datadog-operator/cmd/kubectl-datadog/autoscaling/cluster/common/display" + "github.com/DataDog/datadog-operator/cmd/kubectl-datadog/autoscaling/cluster/common/karpenter" +) + +// RunOptions captures every parameter callers (the evict cobra command) feed +// into Run. Built once by the command's run() method. +type RunOptions struct { + ClusterName string + KarpenterNamespace string // override; auto-detected when empty + All bool + Targets []Target // parsed --target=/ + SkipCA bool + EnsurePDBs bool + DryRun bool + Yes bool + EvictionTimeout time.Duration + NodeTimeout time.Duration + Debug bool +} + +// Run is the orchestrator entry point. Idempotent: a second invocation on a +// cluster already migrated returns success without changes. Crash-safe: a +// killed run leaves no permanently corrupting state — re-running cleans up. +func Run(ctx context.Context, streams genericclioptions.IOStreams, configFlags *genericclioptions.ConfigFlags, clientset *kubernetes.Clientset, opts RunOptions) error { + log.SetOutput(streams.ErrOut) + ctrl.SetLogger(zap.New(zap.UseDevMode(false), zap.WriteTo(streams.ErrOut))) + + cli, err := clients.Build(ctx, configFlags, clientset) + if err != nil { + return fmt.Errorf("failed to build clients: %w", err) + } + if err = clients.ValidateAWSAccountConsistency(ctx, cli, opts.ClusterName, configFlags); err != nil { + return err + } + k, err := karpenter.FindInstallation(ctx, clientset) + if err != nil { + return fmt.Errorf("failed to check for an existing Karpenter installation: %w", err) + } + if k == nil { + return errors.New("Karpenter is not installed on this cluster; run `kubectl datadog autoscaling cluster install` first") + } + namespace := opts.KarpenterNamespace + if namespace == "" { + namespace = k.Namespace + } + + display.PrintBox(streams.Out, "Evicting legacy nodes from cluster "+opts.ClusterName+".") + + info, err := classify(ctx, clientset, cli, opts.ClusterName) + if err != nil { + return fmt.Errorf("failed to classify cluster nodes: %w", err) + } + if info.Autoscaling.EKSAutoMode.Enabled { + return errors.New("EKS auto-mode is enabled on this cluster; eviction is not applicable (auto-mode manages its own node lifecycle)") + } + // Skip the ConfigMap write on dry-run: a preview must not mutate the + // cluster (and must not require write RBAC on the Karpenter namespace). + if opts.DryRun { + log.Printf("[dry-run] would persist cluster-info to ConfigMap %s/%s", namespace, clusterinfo.ConfigMapName) + } else if err = clusterinfo.Persist(ctx, cli.K8sClient, namespace, info); err != nil { + log.Printf("Warning: failed to persist updated cluster-info ConfigMap: %v", err) + } + + // Before any destructive work, verify there is at least one Datadog-managed + // NodePool to receive the evicted pods. Without this guard, scaling legacy + // capacity to 0 would leave the cluster with no working capacity (e.g. when + // `install` was run with --create-karpenter-resources=none, or the user + // deleted the Datadog NodePools out of band). + if !hasDatadogManagedNodePool(info) { + return errors.New("no Datadog-managed Karpenter NodePool found in the cluster snapshot; legacy nodes would have no destination to migrate to. Run `kubectl datadog autoscaling cluster install` (or `update`) to create the Datadog NodePool first") + } + + targets, err := BuildPlan(info, opts.All, opts.Targets) + if err != nil { + return err + } + if len(targets) == 0 { + // A previous run may have been interrupted (e.g. an EKS managed node + // group timed out, leaving its temporary PDBs in place for a later + // rerun). By the time that rerun finds nothing left to evict, those + // leaked PDBs would otherwise persist indefinitely with + // maxUnavailable: 1 and throttle future rollouts/disruptions, so + // reclaim anything left behind before the no-op exit. + reclaimLeakedTempPDBs(ctx, cli.K8sClient, opts.EnsurePDBs, opts.DryRun) + display.PrintBox(streams.Out, "Nothing to evict — the cluster is already on Datadog-managed Karpenter NodePools.") + return nil + } + + runPreflightWarnings(ctx, streams, cli.K8sClient, targets) + printPlan(streams, info, targets, !opts.SkipCA, opts.EnsurePDBs) + + if !opts.Yes && !opts.DryRun { + if !promptConfirmation(streams) { + return nil + } + } + + // Step 1: cluster-autoscaler scale-down — first, so it does not undo our + // work by provisioning new legacy nodes while we drain. + if !opts.SkipCA { + if err := scaleDownClusterAutoscaler(ctx, clientset, info.Autoscaling.ClusterAutoscaler, opts.DryRun); err != nil { + return fmt.Errorf("failed to scale down cluster-autoscaler: %w", err) + } + } + + // Step 2: ensure temporary PDBs. On error, attempt cleanup of whatever + // landed before the failure — the temp PDBs carry our labels so the + // label-based cleanup picks them up. On a SIGINT mid-flight the temp + // PDBs may leak; a subsequent run cleans them up via the same label. + if opts.EnsurePDBs { + if err := ensureTempPDBs(ctx, clientset, cli.K8sClient, targets, opts.DryRun); err != nil { + if cleanupErr := cleanupTempPDBs(ctx, cli.K8sClient, opts.DryRun); cleanupErr != nil { + log.Printf("Warning: failed to cleanup partial temporary PDBs: %v", cleanupErr) + } + return fmt.Errorf("failed to ensure temporary PDBs: %w", err) + } + } + + drainOpts := nodeDrainOptions{ + DryRun: opts.DryRun, + EvictionTimeout: opts.EvictionTimeout, + NodeTimeout: opts.NodeTimeout, + PollInterval: 2 * time.Second, + } + evictor := func(c context.Context, t Target, d nodeDrainOptions) error { + return evictTarget(c, clientset, cli, opts.ClusterName, t, d) + } + result := evictAllTargets(ctx, targets, drainOpts, evictor) + errs := result.Errors + + // Step 4: cleanup temporary PDBs explicitly here — keeping it inline + // (rather than deferred) means the "Deleted temporary PDB …" log lines + // appear BEFORE the final success/failure summary, so the user does not + // experience an apparent hang after seeing the "✅" box. + // + // When at least one EKS managed node group did not finish draining + // within the timeout, we skip the cleanup: EKS may still be evicting + // pods on those nodes, and dropping the temp PDBs now would leave + // cross-type workloads unprotected mid-drain. The label-based cleanup + // on a subsequent run picks the PDBs up once EKS converges. + switch { + case !opts.EnsurePDBs: + case result.EKSDrainIncomplete: + log.Printf("Note: at least one EKS managed node group did not finish draining within --node-timeout; temporary PDBs left in place so EKS can finish safely. Re-run the command once `aws eks describe-nodegroup` reports the group at 0 nodes — the next run will clean them up.") + default: + if err := cleanupTempPDBs(ctx, cli.K8sClient, opts.DryRun); err != nil { + log.Printf("Warning: failed to cleanup temporary PDBs: %v", err) + } + } + + // Re-classify and persist to reflect the final state. Skipped on dry-run + // for the same reason as the pre-eviction Persist above. + if opts.DryRun { + log.Printf("[dry-run] would re-classify and persist post-eviction cluster-info to ConfigMap %s/%s", namespace, clusterinfo.ConfigMapName) + } else if final, classifyErr := classify(ctx, clientset, cli, opts.ClusterName); classifyErr == nil { + if err := clusterinfo.Persist(ctx, cli.K8sClient, namespace, final); err != nil { + log.Printf("Warning: failed to persist post-eviction cluster-info: %v", err) + } + } else { + log.Printf("Warning: failed to re-classify post-eviction: %v", classifyErr) + } + + if len(errs) > 0 { + return fmt.Errorf("eviction completed with %d error(s):\n%w", len(errs), errors.Join(errs...)) + } + display.PrintBox(streams.Out, "✅ Legacy nodes drained from cluster "+opts.ClusterName+".") + return nil +} + +// reclaimLeakedTempPDBs runs the idempotent, label-based temp-PDB cleanup when +// PDB management is enabled. It is invoked on the no-target path so that +// temporary PDBs left behind by a prior interrupted run (e.g. an EKS managed +// node group that timed out) are reclaimed once the cluster has fully migrated +// and there is nothing left to evict. A no-op when ensurePDBs is false. +func reclaimLeakedTempPDBs(ctx context.Context, ctrlClient client.Client, ensurePDBs, dryRun bool) { + if !ensurePDBs { + return + } + if err := cleanupTempPDBs(ctx, ctrlClient, dryRun); err != nil { + log.Printf("Warning: failed to cleanup leftover temporary PDBs: %v", err) + } +} + +// classify wraps clusterinfo.Classify with the call-site boilerplate shared by +// the pre- and post-eviction snapshots. +func classify(ctx context.Context, clientset *kubernetes.Clientset, cli *clients.Clients, clusterName string) (*clusterinfo.ClusterInfo, error) { + return clusterinfo.Classify(ctx, clusterinfo.ClassifyInput{ + K8sClient: clientset, + CtrlClient: cli.K8sClient, + Autoscaling: cli.Autoscaling, + EKS: cli.EKS, + Discovery: clientset.Discovery(), + ClusterName: clusterName, + }) +} + +// evictResult is the structured outcome of evictAllTargets. Errors is the +// aggregated per-target error list (empty when everything succeeded). +// EKSDrainIncomplete is true when at least one EKS managed node group target +// returned an error WRAPPING errEKSDrainIncomplete — the EKS drain may still +// be in progress, in which case the caller must NOT cleanup the temporary +// PDBs (EKS would then over-disrupt the still-running workloads). +type evictResult struct { + Errors []error + EKSDrainIncomplete bool +} + +// targetEvictor is the closure injected into evictAllTargets by Run. Pulling +// the dispatch out as an interface-like function makes the orchestrator +// unit-testable without faking the entire clients.Clients struct. +type targetEvictor func(ctx context.Context, t Target, drainOpts nodeDrainOptions) error + +// evictAllTargets processes targets sequentially, in the order BuildPlan +// produced. Errors are accumulated; a failing target does not abort the +// others — every issue surfaces at the end so a partial migration is fully +// visible. Sequential execution keeps logs linear, bounds the eviction +// pressure on the apiserver, and matches the synchronous per-target shape +// (every evictor — including EKS MNG — already blocks until its drain +// completes). +func evictAllTargets(ctx context.Context, targets []Target, drainOpts nodeDrainOptions, evictor targetEvictor) evictResult { + var ( + errs []error + eksDrainIncomplete bool + ) + for _, t := range targets { + err := evictor(ctx, t, drainOpts) + if err == nil { + continue + } + errs = append(errs, fmt.Errorf("%s/%s: %w", t.Manager, t.Entity, err)) + // Only treat the drain as "still in progress" when EKS accepted the + // scaling change but observation of the drain failed or timed out. + // A failed UpdateNodegroupConfig means EKS never started draining, + // so cleanup is safe. + if errors.Is(err, errEKSDrainIncomplete) { + eksDrainIncomplete = true + } + } + return evictResult{Errors: errs, EKSDrainIncomplete: eksDrainIncomplete} +} + +// evictTarget dispatches a single Target to the manager-specific evictor. +// Wrapped into a closure inside Run so evictAllTargets can be exercised with +// a fake evictor in tests. +func evictTarget(ctx context.Context, clientset kubernetes.Interface, cli *clients.Clients, clusterName string, t Target, drainOpts nodeDrainOptions) error { + switch t.Manager { + case clusterinfo.NodeManagerASG: + return evictASG(ctx, clientset, cli.Autoscaling, t.Entity, t.Nodes, drainOpts) + case clusterinfo.NodeManagerEKSManagedNodeGroup: + return evictEKSManagedNodeGroup(ctx, cli.EKS, clientset, clusterName, t.Entity, drainOpts) + case clusterinfo.NodeManagerKarpenter: + return evictKarpenterUserNodePool(ctx, clientset, t.Entity, t.Nodes, drainOpts) + case clusterinfo.NodeManagerStandalone: + return evictStandalone(ctx, clientset, cli.EC2, t.Nodes, drainOpts) + default: + return fmt.Errorf("unsupported manager %q", t.Manager) + } +} diff --git a/cmd/kubectl-datadog/autoscaling/cluster/evict/run_test.go b/cmd/kubectl-datadog/autoscaling/cluster/evict/run_test.go new file mode 100644 index 0000000000..af7f20577f --- /dev/null +++ b/cmd/kubectl-datadog/autoscaling/cluster/evict/run_test.go @@ -0,0 +1,179 @@ +package evict + +import ( + "context" + "errors" + "fmt" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + policyv1 "k8s.io/api/policy/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + ctrlclient "sigs.k8s.io/controller-runtime/pkg/client" + ctrlfake "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/controller-runtime/pkg/client/interceptor" + + "github.com/DataDog/datadog-operator/cmd/kubectl-datadog/autoscaling/cluster/common/clusterinfo" +) + +// TestReclaimLeakedTempPDBs covers the no-target recovery path: a temp PDB left +// behind by a prior interrupted run is reclaimed only when PDB management is +// enabled and not in dry-run. +func TestReclaimLeakedTempPDBs(t *testing.T) { + leakedTempPDB := func() *policyv1.PodDisruptionBudget { + return &policyv1.PodDisruptionBudget{ + ObjectMeta: metav1.ObjectMeta{ + Name: "leaked", Namespace: "default", + Labels: map[string]string{ + pdbManagedByLabelKey: pdbManagedByLabelValue, + pdbTempLabelKey: pdbTempLabelValue, + }, + }, + } + } + + for _, tc := range []struct { + name string + ensurePDBs bool + dryRun bool + wantDeleted bool + }{ + {name: "enabled deletes leaked temp PDB", ensurePDBs: true, wantDeleted: true}, + {name: "disabled is a no-op", ensurePDBs: false, wantDeleted: false}, + {name: "dry-run does not delete", ensurePDBs: true, dryRun: true, wantDeleted: false}, + } { + t.Run(tc.name, func(t *testing.T) { + cli := ctrlfake.NewClientBuilder().WithScheme(newCtrlScheme(t)).WithObjects(leakedTempPDB()).Build() + + reclaimLeakedTempPDBs(t.Context(), cli, tc.ensurePDBs, tc.dryRun) + + list := &policyv1.PodDisruptionBudgetList{} + require.NoError(t, cli.List(t.Context(), list)) + if tc.wantDeleted { + assert.Empty(t, list.Items) + } else { + require.Len(t, list.Items, 1) + assert.Equal(t, "leaked", list.Items[0].Name) + } + }) + } + + // A cleanup error must be swallowed (logged, not propagated): the no-op + // exit must still succeed even if reclaiming the leaked PDBs fails. + t.Run("cleanup error is swallowed", func(t *testing.T) { + cli := ctrlfake.NewClientBuilder(). + WithScheme(newCtrlScheme(t)). + WithObjects(leakedTempPDB()). + WithInterceptorFuncs(interceptor.Funcs{ + List: func(context.Context, ctrlclient.WithWatch, ctrlclient.ObjectList, ...ctrlclient.ListOption) error { + return errors.New("boom") + }, + }). + Build() + + assert.NotPanics(t, func() { + reclaimLeakedTempPDBs(t.Context(), cli, true, false) + }) + }) +} + +func TestEvictAllTargets(t *testing.T) { + asgTarget := Target{Manager: clusterinfo.NodeManagerASG, Entity: "asg-1"} + karpenterTarget := Target{Manager: clusterinfo.NodeManagerKarpenter, Entity: "np-1"} + eksTarget := Target{Manager: clusterinfo.NodeManagerEKSManagedNodeGroup, Entity: "mng-1"} + standaloneTarget := Target{Manager: clusterinfo.NodeManagerStandalone, Entity: ""} + + for _, tc := range []struct { + name string + targets []Target + evictor targetEvictor + wantErrors int + wantEKSIncomplete bool + wantErrIsSentinel bool + // expectInvoked, when non-nil, lists managers that MUST have had + // their evictor called (verifies that one failing target does not + // abort the others). + expectInvoked []clusterinfo.NodeManager + }{ + { + name: "happy path", + targets: []Target{asgTarget, karpenterTarget}, + evictor: func(_ context.Context, _ Target, _ nodeDrainOptions) error { return nil }, + }, + { + // Safety-critical signal: an EKS MNG target returning + // errEKSDrainIncomplete (wrapped) must flag the result so Run + // skips the temp-PDB cleanup. + name: "EKS sentinel propagates", + targets: []Target{eksTarget, asgTarget}, + evictor: func(_ context.Context, t Target, _ nodeDrainOptions) error { + if t.Manager == clusterinfo.NodeManagerEKSManagedNodeGroup { + return fmt.Errorf("simulated timeout: %w", errEKSDrainIncomplete) + } + return nil + }, + wantErrors: 1, + wantEKSIncomplete: true, + wantErrIsSentinel: true, + }, + { + // An EKS failure that is NOT a wait timeout (e.g. an + // UpdateNodegroupConfig API rejection) means EKS never started + // draining, so temp-PDB cleanup is safe ⇒ flag stays false. + name: "non-sentinel EKS error does not flag", + targets: []Target{eksTarget}, + evictor: func(_ context.Context, _ Target, _ nodeDrainOptions) error { + return errors.New("invalid scaling config") + }, + wantErrors: 1, + }, + { + // An ASG drain failure has nothing to do with EKS — must NOT + // flag. + name: "non-EKS error does not flag", + targets: []Target{asgTarget}, + evictor: func(_ context.Context, _ Target, _ nodeDrainOptions) error { + return errors.New("ASG drain failed") + }, + wantErrors: 1, + }, + { + // One failing target must not abort the rest — the loop keeps + // going and every other manager is still invoked. + name: "failing target does not abort the others", + targets: []Target{asgTarget, karpenterTarget, standaloneTarget}, + evictor: nil, // installed inline below to capture invocations. + wantErrors: 1, + expectInvoked: []clusterinfo.NodeManager{ + clusterinfo.NodeManagerASG, + clusterinfo.NodeManagerKarpenter, + clusterinfo.NodeManagerStandalone, + }, + }, + } { + t.Run(tc.name, func(t *testing.T) { + invoked := map[clusterinfo.NodeManager]int{} + evictor := tc.evictor + if evictor == nil { + evictor = func(_ context.Context, target Target, _ nodeDrainOptions) error { + invoked[target.Manager]++ + if target.Manager == clusterinfo.NodeManagerASG { + return errors.New("boom") + } + return nil + } + } + + result := evictAllTargets(t.Context(), tc.targets, nodeDrainOptions{}, evictor) + require.Len(t, result.Errors, tc.wantErrors) + assert.Equal(t, tc.wantEKSIncomplete, result.EKSDrainIncomplete) + if tc.wantErrIsSentinel { + assert.ErrorIs(t, result.Errors[0], errEKSDrainIncomplete) + } + for _, mgr := range tc.expectInvoked { + assert.NotZero(t, invoked[mgr], "expected %s evictor to be invoked", mgr) + } + }) + } +} diff --git a/cmd/kubectl-datadog/autoscaling/cluster/evict/standalone.go b/cmd/kubectl-datadog/autoscaling/cluster/evict/standalone.go new file mode 100644 index 0000000000..499de4fd96 --- /dev/null +++ b/cmd/kubectl-datadog/autoscaling/cluster/evict/standalone.go @@ -0,0 +1,66 @@ +package evict + +import ( + "context" + "errors" + "fmt" + "log" + + "github.com/aws/aws-sdk-go-v2/service/ec2" + "k8s.io/client-go/kubernetes" + + commonaws "github.com/DataDog/datadog-operator/cmd/kubectl-datadog/autoscaling/cluster/common/aws" +) + +// EC2API is the subset of *ec2.Client used by evictStandalone. Defined as an +// interface so unit tests can stub the AWS SDK out cheaply. +type EC2API interface { + TerminateInstances(ctx context.Context, in *ec2.TerminateInstancesInput, opts ...func(*ec2.Options)) (*ec2.TerminateInstancesOutput, error) +} + +// evictStandalone cordons and drains every standalone EC2 instance (one whose +// Kubernetes Node has an `aws:////i-` providerID but is not in any +// ASG, EKS managed node group, or Karpenter NodePool). Every node is cordoned +// up front (so pods evicted from one node are never rescheduled onto another +// node of the group that is itself about to be drained), then each instance is +// terminated as soon as its node has drained cleanly so the IaaS capacity is +// freed without waiting for the rest of the group. Terminating standalone +// instances directly is safe: by definition they have no controller that would +// relaunch them. +// +// Safety rule: an EC2 instance is only terminated when its node was fully +// drained. If draining failed (PDB-blocked, timeout, etc.), we leave the +// underlying instance intact — terminating it would kill the still-running +// pods. A re-run picks up where this one stopped. +func evictStandalone(ctx context.Context, clientset kubernetes.Interface, ec2API EC2API, nodes []string, drainOpts nodeDrainOptions) error { + cordoned, errs := cordonNodes(ctx, clientset, nodes, drainOpts.DryRun) + + for _, node := range cordoned { + nodeName := node.Name + id, hasInstanceID := commonaws.ExtractEC2InstanceID(node) + if !hasInstanceID { + log.Printf("Warning: node %s has unexpected providerID %q; cannot terminate the underlying instance", nodeName, node.Spec.ProviderID) + } + if err := drainNode(ctx, clientset, nodeName, drainOpts); err != nil { + errs = append(errs, fmt.Errorf("drain node %s: %w", nodeName, err)) + continue // do NOT terminate this instance: workloads are still on it + } + // The node drained cleanly: terminate its instance right away so its + // capacity is freed without waiting for the rest of the group. + if !hasInstanceID { + continue + } + if drainOpts.DryRun { + log.Printf("[dry-run] would terminate standalone instance %s", id) + continue + } + if _, err := ec2API.TerminateInstances(ctx, &ec2.TerminateInstancesInput{ + InstanceIds: []string{id}, + }); err != nil { + errs = append(errs, fmt.Errorf("terminate instance %s: %w", id, err)) + } else { + log.Printf("Terminated standalone EC2 instance %s.", id) + } + } + return errors.Join(errs...) +} diff --git a/cmd/kubectl-datadog/autoscaling/cluster/evict/standalone_test.go b/cmd/kubectl-datadog/autoscaling/cluster/evict/standalone_test.go new file mode 100644 index 0000000000..49695d29b2 --- /dev/null +++ b/cmd/kubectl-datadog/autoscaling/cluster/evict/standalone_test.go @@ -0,0 +1,198 @@ +package evict + +import ( + "context" + "errors" + "testing" + "time" + + "github.com/aws/aws-sdk-go-v2/service/ec2" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" + policyv1 "k8s.io/api/policy/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/kubernetes/fake" + clienttesting "k8s.io/client-go/testing" +) + +type stubEC2 struct { + terminated []string + err error +} + +func (s *stubEC2) TerminateInstances(_ context.Context, in *ec2.TerminateInstancesInput, _ ...func(*ec2.Options)) (*ec2.TerminateInstancesOutput, error) { + s.terminated = append(s.terminated, in.InstanceIds...) + return &ec2.TerminateInstancesOutput{}, s.err +} + +func TestEvictStandalone(t *testing.T) { + ec2Node := func(name, az, id string) *corev1.Node { + return &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{Name: name}, + Spec: corev1.NodeSpec{ProviderID: "aws:///" + az + "/" + id}, + } + } + stuckPod := func() *corev1.Pod { + return &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "blocker", Namespace: "default"}, + Spec: corev1.PodSpec{NodeName: "ip-stuck"}, + } + } + gceNode := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{Name: "ip-1"}, + Spec: corev1.NodeSpec{ProviderID: "gce:///zone/instance"}, + } + fastDrain := nodeDrainOptions{ + EvictionTimeout: 50 * time.Millisecond, + NodeTimeout: 100 * time.Millisecond, + PollInterval: 10 * time.Millisecond, + } + + for _, tc := range []struct { + name string + objects []runtime.Object + installEvictionReactor bool + nodes []string + opts nodeDrainOptions + // stubErr, when set, makes every TerminateInstances call fail. + stubErr error + wantErr bool + // wantTerminated is the expected set of instance IDs in + // stubEC2.terminated. nil ⇒ TerminateInstances must not be called. + wantTerminated []string + // wantUnschedulable: per-node expected `Spec.Unschedulable`. + wantUnschedulable map[string]bool + }{ + { + name: "happy path terminates and cordons two nodes", + objects: []runtime.Object{ec2Node("ip-1", "eu-west-3a", "i-aaa"), ec2Node("ip-2", "eu-west-3b", "i-bbb")}, + nodes: []string{"ip-1", "ip-2"}, + opts: newDrainOpts(false), + wantTerminated: []string{"i-aaa", "i-bbb"}, + wantUnschedulable: map[string]bool{"ip-1": true, "ip-2": true}, + }, + { + name: "dry-run touches nothing", + objects: []runtime.Object{ec2Node("ip-1", "eu-west-3a", "i-aaa")}, + nodes: []string{"ip-1"}, + opts: newDrainOpts(true), + wantUnschedulable: map[string]bool{"ip-1": false}, + }, + { + // Node already gone from K8s ⇒ no instance ID extracted ⇒ + // no TerminateInstances call. + name: "node already gone skips terminate", + nodes: []string{"ip-1"}, + opts: newDrainOpts(false), + }, + { + // Non-EC2 providerID can't yield an instance ID, but the + // node is still cordoned and drained. + name: "non-EC2 providerID skips terminate but still cordons", + objects: []runtime.Object{gceNode}, + nodes: []string{"ip-1"}, + opts: newDrainOpts(false), + wantUnschedulable: map[string]bool{"ip-1": true}, + }, + { + // Safety regression: drain failure must prevent the + // instance from being terminated (otherwise the EC2 dies + // mid-grace-period). + name: "drain failure prevents terminate", + objects: []runtime.Object{ec2Node("ip-stuck", "eu-west-3a", "i-aaaaaaaaaaaaaaaaa"), stuckPod()}, + installEvictionReactor: true, + nodes: []string{"ip-stuck"}, + opts: fastDrain, + wantErr: true, + }, + { + // A TerminateInstances failure on one node must propagate as an + // error yet not stop the loop: every drained node's instance is + // still attempted. + name: "terminate failure propagates but loop continues", + objects: []runtime.Object{ec2Node("ip-1", "eu-west-3a", "i-aaa"), ec2Node("ip-2", "eu-west-3b", "i-bbb")}, + nodes: []string{"ip-1", "ip-2"}, + opts: newDrainOpts(false), + stubErr: errors.New("terminate boom"), + wantErr: true, + wantTerminated: []string{"i-aaa", "i-bbb"}, + }, + } { + t.Run(tc.name, func(t *testing.T) { + client := fake.NewClientset(tc.objects...) + if tc.installEvictionReactor { + installPodEvictionReactor(client) + } + stub := &stubEC2{err: tc.stubErr} + + err := evictStandalone(t.Context(), client, stub, tc.nodes, tc.opts) + if tc.wantErr { + require.Error(t, err) + } else { + require.NoError(t, err) + } + if tc.wantTerminated == nil { + assert.Empty(t, stub.terminated) + } else { + assert.ElementsMatch(t, tc.wantTerminated, stub.terminated) + } + for nodeName, want := range tc.wantUnschedulable { + got, getErr := client.CoreV1().Nodes().Get(t.Context(), nodeName, metav1.GetOptions{}) + require.NoError(t, getErr) + assert.Equal(t, want, got.Spec.Unschedulable, "Spec.Unschedulable for %s", nodeName) + } + }) + } +} + +// TestEvictStandaloneCordonsAllBeforeDraining locks in the cordon-all-up-front +// behavior: every node of the group is cordoned before ANY node starts +// draining, so a pod evicted off one node is never rescheduled onto a sibling +// node that is itself about to be drained. It asserts that at the moment ip-1's +// pod is evicted, the sibling ip-2 has already been cordoned. +func TestEvictStandaloneCordonsAllBeforeDraining(t *testing.T) { + ec2Node := func(name, az, id string) *corev1.Node { + return &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{Name: name}, + Spec: corev1.NodeSpec{ProviderID: "aws:///" + az + "/" + id}, + } + } + pod1 := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "p1", Namespace: "default"}, + Spec: corev1.PodSpec{NodeName: "ip-1"}, + } + client := fake.NewClientset(ec2Node("ip-1", "eu-west-3a", "i-aaa"), ec2Node("ip-2", "eu-west-3b", "i-bbb"), pod1) + + var ( + recorded bool + node2CordonedAtFirstEviction bool + ) + // On the first pod eviction (ip-1's drain), record whether ip-2 is already + // cordoned, then delete the evicted pod so the node becomes empty and the + // drain completes. Read/mutate via the tracker, not the typed client: + // Fake.Invokes holds a global lock for the whole reaction, so re-entering + // the typed client from inside a reactor would deadlock. + client.PrependReactor("create", "pods", func(action clienttesting.Action) (bool, runtime.Object, error) { + ca, ok := action.(clienttesting.CreateAction) + if !ok || ca.GetSubresource() != "eviction" { + return false, nil, nil + } + if !recorded { + recorded = true + if obj, err := client.Tracker().Get(corev1.SchemeGroupVersion.WithResource("nodes"), "", "ip-2"); err == nil { + node2CordonedAtFirstEviction = obj.(*corev1.Node).Spec.Unschedulable + } + } + evic := ca.GetObject().(*policyv1.Eviction) + _ = client.Tracker().Delete(corev1.SchemeGroupVersion.WithResource("pods"), evic.Namespace, evic.Name) + return true, ca.GetObject(), nil + }) + + stub := &stubEC2{} + err := evictStandalone(t.Context(), client, stub, []string{"ip-1", "ip-2"}, newDrainOpts(false)) + require.NoError(t, err) + assert.True(t, node2CordonedAtFirstEviction, "ip-2 must be cordoned before ip-1 starts draining") + assert.ElementsMatch(t, []string{"i-aaa", "i-bbb"}, stub.terminated) +} diff --git a/cmd/kubectl-datadog/autoscaling/cluster/uninstall/uninstall.go b/cmd/kubectl-datadog/autoscaling/cluster/uninstall/uninstall.go index 593b8f767c..f0c8195513 100644 --- a/cmd/kubectl-datadog/autoscaling/cluster/uninstall/uninstall.go +++ b/cmd/kubectl-datadog/autoscaling/cluster/uninstall/uninstall.go @@ -206,57 +206,57 @@ func displayResourceSummary(ctx context.Context, cmd *cobra.Command, cli *client cmd.Println("\nThis will delete:") if n, err := listKarpenterNodePools(ctx, cli); err != nil { - cmd.Printf(" - NodePools: (unable to list: %v)\n", err) + cmd.Printf(" • NodePools: (unable to list: %v)\n", err) } else if len(n) == 0 { - cmd.Println(" - NodePools: none found") + cmd.Println(" • NodePools: none found") } else { nodePools = n - cmd.Printf(" - %d NodePool(s):\n", len(nodePools)) + cmd.Printf(" • %d NodePool(s):\n", len(nodePools)) for _, np := range nodePools { - cmd.Printf(" • %s\n", np) + cmd.Printf(" ◦ %s\n", np) } } nodeClasses, err := listKarpenterEC2NodeClasses(ctx, cli) if err != nil { - cmd.Printf(" - EC2NodeClasses: (unable to list: %v)\n", err) + cmd.Printf(" • EC2NodeClasses: (unable to list: %v)\n", err) } else if len(nodeClasses) == 0 { - cmd.Println(" - EC2NodeClasses: none found") + cmd.Println(" • EC2NodeClasses: none found") } else { - cmd.Printf(" - %d EC2NodeClass(es):\n", len(nodeClasses)) + cmd.Printf(" • %d EC2NodeClass(es):\n", len(nodeClasses)) for _, nc := range nodeClasses { - cmd.Printf(" • %s\n", nc) + cmd.Printf(" ◦ %s\n", nc) } } if err != nil { - cmd.Println(" - Karpenter nodes: (unable to list - depends on EC2NodeClasses)") + cmd.Println(" • Karpenter nodes: (unable to list - depends on EC2NodeClasses)") } else if n, err := listKarpenterNodes(ctx, cli, nodeClasses); err != nil { - cmd.Printf(" - Karpenter nodes: (unable to list: %v)\n", err) + cmd.Printf(" • Karpenter nodes: (unable to list: %v)\n", err) } else if len(n) == 0 { - cmd.Println(" - Karpenter nodes: none found") + cmd.Println(" • Karpenter nodes: none found") } else { nodes = n - cmd.Printf(" - %d Karpenter-managed node(s):\n", len(nodes)) + cmd.Printf(" • %d Karpenter-managed node(s):\n", len(nodes)) for _, node := range nodes { - cmd.Printf(" • %s\n", node) + cmd.Printf(" ◦ %s\n", node) } } - cmd.Println(" - The Karpenter Helm release") + cmd.Println(" • The Karpenter Helm release") if stacks, err := listCloudFormationStacks(ctx, cli, clusterName); err != nil { - cmd.Printf(" - CloudFormation stacks: (unable to list: %v)\n", err) + cmd.Printf(" • CloudFormation stacks: (unable to list: %v)\n", err) } else if len(stacks) == 0 { - cmd.Println(" - CloudFormation stacks: none found") + cmd.Println(" • CloudFormation stacks: none found") } else { - cmd.Printf(" - %d CloudFormation stack(s):\n", len(stacks)) + cmd.Printf(" • %d CloudFormation stack(s):\n", len(stacks)) for _, stack := range stacks { - cmd.Printf(" • %s\n", stack) + cmd.Printf(" ◦ %s\n", stack) } } - cmd.Println(" - aws-auth ConfigMap role mappings (if applicable)") + cmd.Println(" • aws-auth ConfigMap role mappings (if applicable)") if len(nodes) > 0 { cmd.Println()