
Commit 330c612

Fix Evicted Pods Skewing Available Replicas Calculation (#278)
* Ignore evicted pods when calculating ready replicas
* Remove unintentional NL
* Included PodSucceeded in ignored statuses
1 parent 3b137f3 commit 330c612
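
For context, the change below reduces to treating any pod in a terminal phase as an "incorrect" target pod rather than a candidate for readiness. Here is a minimal standalone sketch of that condition; the isTerminalPhase helper and the example pods are illustrative only, not part of this commit, and it assumes the k8s.io/api/core/v1 package is available:

package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
)

// isTerminalPhase mirrors the condition added in getReadyPodsCount: pods in the
// Failed phase (e.g. evicted under node pressure) or the Succeeded phase will
// never become ready again, so they should not count toward ready replicas.
func isTerminalPhase(pod *corev1.Pod) bool {
	return pod.Status.Phase == corev1.PodFailed || pod.Status.Phase == corev1.PodSucceeded
}

func main() {
	evicted := &corev1.Pod{Status: corev1.PodStatus{
		Phase:  corev1.PodFailed,
		Reason: "Evicted", // reason typically reported for node-pressure evictions
	}}
	running := &corev1.Pod{Status: corev1.PodStatus{Phase: corev1.PodRunning}}

	fmt.Println(isTerminalPhase(evicted)) // true  -> ignored by the replica calculator
	fmt.Println(isTerminalPhase(running)) // false -> evaluated against PodReady as before
}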

2 files changed: 60 additions, 0 deletions
controllers/datadoghq/replica_calculator.go

Lines changed: 16 additions & 0 deletions
@@ -525,6 +525,22 @@ func (c *ReplicaCalculator) getReadyPodsCount(log logr.Logger, targetName string
 			incorrectTargetPodsCount++
 			continue
 		}
+
+		// PodFailed
+		// During a node-pressure eviction, the kubelet sets the phase for the selected pods to Failed, and terminates
+		// the Pod. These pods should be ignored because they may not be garbage collected for a long time.
+		// https://kubernetes.io/docs/concepts/scheduling-eviction/node-pressure-eviction/
+
+		// PodSucceeded
+		// A Deployment's Pod should never be in PodSucceeded. If it is, it usually means:
+		// - The Pod is running a one-shot script instead of a service.
+		// - The restartPolicy is misconfigured.
+		// - A Job-like process was accidentally set up in a Deployment.
+		if pod.Status.Phase == corev1.PodFailed || pod.Status.Phase == corev1.PodSucceeded {
+			incorrectTargetPodsCount++
+			continue
+		}
+
 		_, condition := getPodCondition(&pod.Status, corev1.PodReady)
 		// We can't distinguish pods that are past the Readiness in the lifecycle but have not reached it
 		// and pods that are still Unschedulable but we don't need this level of granularity.

controllers/datadoghq/replica_calculator_test.go

Lines changed: 44 additions & 0 deletions
@@ -1966,6 +1966,50 @@ func TestTooManyUnreadyPods(t *testing.T) {
 	tc.runTest(t)
 }
 
+// Pods that have either [PodFailed, PodSucceeded] statuses should not be counted when evaluating the
+// MinAvailableReplicaPercentage as they both indicate pods that will never be ready again.
+
+// This test makes sure that these pod statuses are ignored when calculating the ready replicas.
+func TestMinAverageReplicaPercentageIgnoresFailedAndSucceededPods(t *testing.T) {
+	logf.SetLogger(zap.New())
+	metric1 := v1alpha1.MetricSpec{
+		Type: v1alpha1.ExternalMetricSourceType,
+		External: &v1alpha1.ExternalMetricSource{
+			MetricName:     "loadbalancer.request.per.seconds",
+			MetricSelector: &metav1.LabelSelector{MatchLabels: map[string]string{"foo": "bar"}},
+			HighWatermark:  resource.NewMilliQuantity(10000, resource.DecimalSI),
+			LowWatermark:   resource.NewMilliQuantity(7000, resource.DecimalSI),
+		},
+	}
+
+	tc := replicaCalcTestCase{
+		expectedReplicas: 3,
+		readyReplicas:    1,
+		pos: metricPosition{
+			isAbove: true,
+			isBelow: false,
+		},
+		scale: makeScale(testDeploymentName, 3, map[string]string{"name": "test-pod"}),
+		wpa: &v1alpha1.WatermarkPodAutoscaler{
+			Spec: v1alpha1.WatermarkPodAutoscalerSpec{
+				Algorithm:                     "average",
+				MinAvailableReplicaPercentage: 34,
+				Tolerance:                     *resource.NewMilliQuantity(20, resource.DecimalSI),
+				Metrics:                       []v1alpha1.MetricSpec{metric1},
+				ReplicaScalingAbsoluteModulo:  v1alpha1.NewInt32(1),
+			},
+		},
+		// simulate an evicted pod (Failed) and a completed pod (Succeeded)
+		podPhase: []corev1.PodPhase{corev1.PodRunning, corev1.PodFailed, corev1.PodSucceeded},
+		metric: &metricInfo{
+			spec:                metric1,
+			levels:              []int64{30000},
+			expectedUtilization: 30000, // only 1 ready replica so it's 100% utilized
+		},
+	}
+	tc.runTest(t)
+}
+
 // We have pods that are pending and one is within an acceptable window.
 func TestPendingNotExpiredScale(t *testing.T) {
 	logf.SetLogger(zap.New())