Commit 27bf91e

JorTurFer authored and zroubalik committed
fix: Prevented stuck status due to timeouts during scalers generation (kedacore#5084)
1 parent: 4467bb0 · commit: 27bf91e

File tree

3 files changed (+203, -17 lines)


CHANGELOG.md (1 addition, 1 deletion)

@@ -63,7 +63,7 @@ Here is an overview of all new **experimental** features:
 
 ### Fixes
 
-- **General**: TODO ([#XXX](https://github.com/kedacore/keda/issues/XXX))
+- **General**: Prevented stuck status due to timeouts during scalers generation ([#5083](https://github.com/kedacore/keda/issues/5083))
 
 ### Deprecations
 
pkg/scaling/scale_handler.go (16 additions, 16 deletions)

@@ -291,36 +291,25 @@ func (h *scaleHandler) getScalersCacheForScaledObject(ctx context.Context, scale
 // performGetScalersCache returns cache for input scalableObject, it is common code used by GetScalersCache() and getScalersCacheForScaledObject() methods
 func (h *scaleHandler) performGetScalersCache(ctx context.Context, key string, scalableObject interface{}, scalableObjectGeneration *int64, scalableObjectKind, scalableObjectNamespace, scalableObjectName string) (*cache.ScalersCache, error) {
     h.scalerCachesLock.RLock()
+    regenerateCache := false
     if cache, ok := h.scalerCaches[key]; ok {
         // generation was specified -> let's include it in the check as well
         if scalableObjectGeneration != nil {
             if cache.ScalableObjectGeneration == *scalableObjectGeneration {
                 h.scalerCachesLock.RUnlock()
                 return cache, nil
             }
+            // object was found in cache, but the generation is not correct,
+            // we'll need to close scalers in the cache and
+            // proceed further to recreate the cache
+            regenerateCache = true
         } else {
             h.scalerCachesLock.RUnlock()
             return cache, nil
         }
     }
     h.scalerCachesLock.RUnlock()
 
-    h.scalerCachesLock.Lock()
-    defer h.scalerCachesLock.Unlock()
-    if cache, ok := h.scalerCaches[key]; ok {
-        // generation was specified -> let's include it in the check as well
-        if scalableObjectGeneration != nil {
-            if cache.ScalableObjectGeneration == *scalableObjectGeneration {
-                return cache, nil
-            }
-            // object was found in cache, but the generation is not correct,
-            // let's close scalers in the cache and proceed further to recreate the cache
-            cache.Close(ctx)
-        } else {
-            return cache, nil
-        }
-    }
-
     if scalableObject == nil {
         switch scalableObjectKind {
         case "ScaledObject":
@@ -388,6 +377,17 @@ func (h *scaleHandler) performGetScalersCache(ctx context.Context, key string, s
     default:
     }
 
+    // Scalers Close() could be impacted by timeouts, blocking the mutex
+    // until the timeout happens. Instead of locking the mutex, we take
+    // the old cache item and close it in another goroutine, without locking
+    // the cache: https://github.com/kedacore/keda/issues/5083
+    if regenerateCache {
+        oldCache := h.scalerCaches[key]
+        go oldCache.Close(ctx)
+    }
+
+    h.scalerCachesLock.Lock()
+    defer h.scalerCachesLock.Unlock()
     h.scalerCaches[key] = newCache
     return h.scalerCaches[key], nil
 }
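The new comment in performGetScalersCache captures the core idea of the fix: a scaler's Close() can block on network timeouts, so it must not run while the cache mutex is held, otherwise every other ScaledObject waiting for that lock stalls along with it. Below is a minimal, self-contained sketch of that pattern in isolation; the names (slowCloser, cacheStore, replace) are hypothetical stand-ins, not KEDA's actual types.

package main

import (
    "context"
    "fmt"
    "sync"
    "time"
)

// slowCloser stands in for a cache entry whose Close() can block on a timeout.
type slowCloser struct{ name string }

func (s *slowCloser) Close(ctx context.Context) {
    // Simulate a scaler that blocks until a (long) timeout expires.
    select {
    case <-time.After(2 * time.Second):
    case <-ctx.Done():
    }
    fmt.Println("closed", s.name)
}

type cacheStore struct {
    mu    sync.RWMutex
    items map[string]*slowCloser
}

// replace swaps the cached entry for key without holding the write lock while
// the stale entry is being closed, mirroring the structure of the fix above.
func (c *cacheStore) replace(ctx context.Context, key string, fresh *slowCloser) {
    c.mu.RLock()
    old := c.items[key]
    c.mu.RUnlock()

    if old != nil {
        // Close the stale entry asynchronously; a blocked Close() no longer
        // stalls other callers that need the cache lock.
        go old.Close(ctx)
    }

    c.mu.Lock()
    defer c.mu.Unlock()
    c.items[key] = fresh
}

func main() {
    c := &cacheStore{items: map[string]*slowCloser{"so": {name: "stale"}}}
    start := time.Now()
    c.replace(context.Background(), "so", &slowCloser{name: "fresh"})
    fmt.Printf("replace returned after %v (not blocked by the slow Close)\n", time.Since(start).Round(time.Millisecond))
    time.Sleep(3 * time.Second) // let the background Close finish before exiting
}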
New e2e test (package broken_scaledobject_tolerancy_test): 186 additions, 0 deletions

//go:build e2e
// +build e2e

package broken_scaledobject_tolerancy_test

import (
    "fmt"
    "testing"

    "github.com/stretchr/testify/assert"
    "k8s.io/client-go/kubernetes"

    . "github.com/kedacore/keda/v2/tests/helper"
)

const (
    testName = "broken-scaledobject-tolerancy-test"
)

var (
    testNamespace           = fmt.Sprintf("%s-ns", testName)
    deploymentName          = fmt.Sprintf("%s-deployment", testName)
    monitoredDeploymentName = fmt.Sprintf("%s-monitored", testName)
    scaledObjectName        = fmt.Sprintf("%s-so", testName)
)

type templateData struct {
    TestNamespace           string
    DeploymentName          string
    MonitoredDeploymentName string
    ScaledObjectName        string
}

const (
    monitoredDeploymentTemplate = `apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{.MonitoredDeploymentName}}
  namespace: {{.TestNamespace}}
  labels:
    deploy: workload-test
spec:
  replicas: 0
  selector:
    matchLabels:
      pod: workload-test
  template:
    metadata:
      labels:
        pod: workload-test
    spec:
      containers:
        - name: nginx
          image: 'nginxinc/nginx-unprivileged'`

    deploymentTemplate = `apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{.DeploymentName}}
  namespace: {{.TestNamespace}}
  labels:
    deploy: workload-sut
spec:
  replicas: 0
  selector:
    matchLabels:
      pod: workload-sut
  template:
    metadata:
      labels:
        pod: workload-sut
    spec:
      containers:
        - name: nginx
          image: 'nginxinc/nginx-unprivileged'`

    brokenScaledObjectTemplate = `
apiVersion: keda.sh/v1alpha1
kind: ScaledObject
metadata:
  name: {{.ScaledObjectName}}-broken
  namespace: {{.TestNamespace}}
spec:
  scaleTargetRef:
    name: {{.MonitoredDeploymentName}}
  minReplicaCount: 0
  maxReplicaCount: 1
  triggers:
    - metadata:
        activationLagThreshold: '1'
        bootstrapServers: 1.2.3.4:9092
        consumerGroup: earliest
        lagThreshold: '1'
        offsetResetPolicy: earliest
        topic: kafka-topic
      type: kafka
`

    scaledObjectTemplate = `apiVersion: keda.sh/v1alpha1
kind: ScaledObject
metadata:
  name: {{.ScaledObjectName}}
  namespace: {{.TestNamespace}}
spec:
  scaleTargetRef:
    name: {{.DeploymentName}}
  pollingInterval: 1
  cooldownPeriod: 0
  minReplicaCount: 0
  maxReplicaCount: 10
  advanced:
    horizontalPodAutoscalerConfig:
      behavior:
        scaleDown:
          stabilizationWindowSeconds: 5
  triggers:
    - type: kubernetes-workload
      metadata:
        podSelector: 'pod=workload-test'
        value: '1'
`
)

// We need to ensure that a broken ScaledObject doesn't impact
// other ScaledObjects (https://github.com/kedacore/keda/issues/5083),
// so this test deploys a broken ScaledObject pointing to a missing endpoint,
// which constantly produces timeouts. In the meantime, we deploy another
// ScaledObject and validate that it keeps working even though the broken
// ScaledObject times out all the time. This guards against introducing
// deadlocks in the internal scalers cache.
func TestBrokenScaledObjectTolerance(t *testing.T) {
    // setup
    t.Log("--- setting up ---")
    // Create kubernetes resources
    kc := GetKubernetesClient(t)
    data, templates := getTemplateData()

    CreateKubernetesResources(t, kc, testNamespace, data, templates)

    testScaleOut(t, kc)
    testScaleIn(t, kc)

    // cleanup
    DeleteKubernetesResources(t, testNamespace, data, templates)
}

func getTemplateData() (templateData, []Template) {
    return templateData{
        TestNamespace:           testNamespace,
        DeploymentName:          deploymentName,
        ScaledObjectName:        scaledObjectName,
        MonitoredDeploymentName: monitoredDeploymentName,
    }, []Template{
        {Name: "deploymentTemplate", Config: deploymentTemplate},
        {Name: "monitoredDeploymentTemplate", Config: monitoredDeploymentTemplate},
        {Name: "scaledObjectTemplate", Config: scaledObjectTemplate},
        {Name: "brokenScaledObjectTemplate", Config: brokenScaledObjectTemplate},
    }
}

func testScaleOut(t *testing.T, kc *kubernetes.Clientset) {
    // scale monitored deployment to 2 replicas
    replicas := 2
    KubernetesScaleDeployment(t, kc, monitoredDeploymentName, int64(replicas), testNamespace)
    assert.True(t, WaitForDeploymentReplicaReadyCount(t, kc, deploymentName, testNamespace, replicas, 10, 6),
        fmt.Sprintf("replica count should be %d after 1 minute", replicas))

    // scale monitored deployment to 4 replicas
    replicas = 4
    KubernetesScaleDeployment(t, kc, monitoredDeploymentName, int64(replicas), testNamespace)
    assert.True(t, WaitForDeploymentReplicaReadyCount(t, kc, deploymentName, testNamespace, replicas, 10, 6),
        fmt.Sprintf("replica count should be %d after 1 minute", replicas))
}

func testScaleIn(t *testing.T, kc *kubernetes.Clientset) {
    // scale monitored deployment to 2 replicas
    replicas := 2
    KubernetesScaleDeployment(t, kc, monitoredDeploymentName, int64(replicas), testNamespace)
    assert.True(t, WaitForDeploymentReplicaReadyCount(t, kc, deploymentName, testNamespace, replicas, 10, 6),
        fmt.Sprintf("replica count should be %d after 1 minute", replicas))

    // scale monitored deployment to 0 replicas
    replicas = 0
    KubernetesScaleDeployment(t, kc, monitoredDeploymentName, int64(replicas), testNamespace)
    assert.True(t, WaitForDeploymentReplicaReadyCount(t, kc, deploymentName, testNamespace, replicas, 10, 6),
        fmt.Sprintf("replica count should be %d after 1 minute", replicas))
}
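Because of the //go:build e2e constraint at the top of the file, this test is only compiled and run when the e2e build tag is supplied (for example via the repository's e2e test tooling), so it does not affect regular unit test runs.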
