
Commit 54dc371

Merge pull request #407 from A-Kamaee/node-drain-parametrization-2
Introduce configurable parameters for ES client node draining
2 parents 7ba2452 + b2e5752 commit 54dc371

11 files changed: +437 -83 lines

README.md

+31 -24
````diff
@@ -61,34 +61,41 @@ spec:
     scaleDownThresholdDurationSeconds: 1800
     scaleDownCooldownSeconds: 3600
     diskUsagePercentScaledownWatermark: 80
+  experimental:
+    draining:
+      maxRetries: 999
+      maximumWaitTimeDurationSeconds: 30
+      minimumWaitTimeDurationSeconds: 10
 ```

 ### Custom resource properties

 | Key | Description | Type |
 |-----|-------------|------|
 | spec.replicas | Initial size of the StatefulSet. If auto-scaling is disabled, this is your desired cluster size. | Int |
 | spec.excludeSystemIndices | Enable or disable inclusion of system indices like '.kibana' when calculating the shard-per-node ratio and scaling index replica counts. Those are usually managed by Elasticsearch internally. Default is false for backwards compatibility. | Boolean |
 | spec.skipDraining | Allows the ES Operator to terminate an Elasticsearch node without re-allocating its data. This is useful for persistent disk setups, like EBS volumes. Beware that the ES Operator does not verify that you have more than one copy of your indices and therefore wouldn't protect you from potential data loss. (default=false) | Boolean |
 | spec.scaling.enabled | Enable or disable auto-scaling. May be necessary to enforce manual scaling. | Boolean |
 | spec.scaling.minReplicas | Minimum Pod replicas. Lower bound (inclusive) when scaling down. | Int |
 | spec.scaling.maxReplicas | Maximum Pod replicas. Upper bound (inclusive) when scaling up. | Int |
 | spec.scaling.minIndexReplicas | Minimum index replicas. Lower bound (inclusive) when reducing index copies. (Reminder: total copies is replicas+1 in Elasticsearch.) | Int |
 | spec.scaling.maxIndexReplicas | Maximum index replicas. Upper bound (inclusive) when increasing index copies. | Int |
 | spec.scaling.minShardsPerNode | Minimum shard-per-node ratio. When reached, scaling up also requires adding more index replicas. | Int |
 | spec.scaling.maxShardsPerNode | Maximum shard-per-node ratio. Boundary for scaling down. | Int |
 | spec.scaling.scaleUpCPUBoundary | (Median) CPU consumption/request ratio to consistently exceed in order to trigger scale-up. | Int |
 | spec.scaling.scaleUpThresholdDurationSeconds | Duration in seconds required to meet the scale-up criteria before scaling. | Int |
 | spec.scaling.scaleUpCooldownSeconds | Minimum duration in seconds between two scale-up operations. | Int |
 | spec.scaling.scaleDownCPUBoundary | (Median) CPU consumption/request ratio to consistently fall below in order to trigger scale-down. | Int |
 | spec.scaling.scaleDownThresholdDurationSeconds | Duration in seconds required to meet the scale-down criteria before scaling. | Int |
 | spec.scaling.scaleDownCooldownSeconds | Minimum duration in seconds between two scale-down operations. | Int |
 | spec.scaling.diskUsagePercentScaledownWatermark | If disk usage on one of the nodes exceeds this threshold, scaling down will be prevented. | Float |
+| spec.experimental.draining.maxRetries | Maximum number of attempts to drain a node. (default=999) | Int |
+| spec.experimental.draining.maximumWaitTimeDurationSeconds | Maximum wait time in seconds between retry attempts after a failed node drain. (default=30) | Int |
+| spec.experimental.draining.minimumWaitTimeDurationSeconds | Minimum wait time in seconds between retry attempts after a failed node drain. (default=10) | Int |
 | status.lastScaleUpStarted | Timestamp of start of last scale-up activity | Timestamp |
 | status.lastScaleUpEnded | Timestamp of end of last scale-up activity | Timestamp |
 | status.lastScaleDownStarted | Timestamp of start of last scale-down activity | Timestamp |
 | status.lastScaleDownEnded | Timestamp of end of last scale-down activity | Timestamp |


 ## How it scales
````
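The three `experimental.draining` settings describe a bounded retry loop: try to drain a node up to `maxRetries` times, waiting somewhere between `minimumWaitTimeDurationSeconds` and `maximumWaitTimeDurationSeconds` after each failed attempt. The sketch below only illustrates that semantics; the `drainOnce` callback and the exponential-backoff choice are assumptions made for illustration, not the operator's actual drain code.

```go
package main

import (
	"errors"
	"fmt"
	"time"
)

// drainWithRetries retries drainOnce up to maxRetries times, sleeping between
// attempts. The wait starts at minWait and doubles after every failure, capped
// at maxWait (the exponential backoff is an assumption, not the operator's code).
func drainWithRetries(drainOnce func() error, maxRetries int, minWait, maxWait time.Duration) error {
	wait := minWait
	var lastErr error
	for attempt := 1; attempt <= maxRetries; attempt++ {
		lastErr = drainOnce()
		if lastErr == nil {
			return nil // node drained successfully
		}
		fmt.Printf("drain attempt %d failed: %v, retrying in %s\n", attempt, lastErr, wait)
		time.Sleep(wait)
		wait *= 2
		if wait > maxWait {
			wait = maxWait
		}
	}
	return fmt.Errorf("giving up after %d attempts: %w", maxRetries, lastErr)
}

func main() {
	attempts := 0
	// Waits shrunk to milliseconds so the example finishes quickly; the README
	// defaults correspond to 999, 10*time.Second, 30*time.Second.
	err := drainWithRetries(func() error {
		attempts++
		if attempts < 3 {
			return errors.New("shards still relocating")
		}
		return nil
	}, 999, 10*time.Millisecond, 30*time.Millisecond)
	fmt.Println("result:", err)
}
```

With the defaults (999 retries, 10-30 s waits) a node that keeps failing to drain is retried for a long time; lowering `maxRetries` is the knob for giving up earlier.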

cmd/e2e/test_environment.go

+7 -1
```diff
@@ -4,6 +4,7 @@ import (
     "fmt"
     "net/url"
     "os"
+    "time"

     "github.com/sirupsen/logrus"
     "github.com/zalando-incubator/es-operator/operator"
@@ -90,5 +91,10 @@ func setupESClient(defaultServiceEndpoint, version string) (*operator.ESClient,
     if err != nil {
         return nil, err
     }
-    return &operator.ESClient{Endpoint: endpoint}, nil
+    config := &operator.DrainingConfig{
+        MaxRetries:      999,
+        MinimumWaitTime: 10 * time.Second,
+        MaximumWaitTime: 30 * time.Second,
+    }
+    return &operator.ESClient{Endpoint: endpoint, DrainingConfig: config}, nil
 }
```

docs/zalando.org_elasticsearchdatasets.yaml

+38
```diff
@@ -58,6 +58,44 @@ spec:
             description: Exclude management of System Indices on this Data Set. Defaults to false
             type: boolean
+          experimental:
+            description: Experimental represents configurations marked as experimental that may change in future releases. Currently, it manages the draining behavior.
+            properties:
+              draining:
+                description: Draining controls behaviour of the EDS while draining nodes.
+                properties:
+                  maxRetries:
+                    default: 999
+                    description: MaxRetries specifies the maximum number of attempts to drain a node. The default value is 999.
+                    format: int32
+                    minimum: 0
+                    type: integer
+                  maximumWaitTimeDurationSeconds:
+                    default: 30
+                    description: MaximumWaitTimeDurationSeconds specifies the maximum wait time in seconds between retry attempts after a failed node drain. The default value is 30 seconds.
+                    format: int64
+                    minimum: 0
+                    type: integer
+                  minimumWaitTimeDurationSeconds:
+                    default: 10
+                    description: MinimumWaitTimeDurationSeconds specifies the minimum wait time in seconds between retry attempts after a failed node drain. The default value is 10 seconds.
+                    format: int64
+                    minimum: 0
+                    type: integer
+                required:
+                - maxRetries
+                - maximumWaitTimeDurationSeconds
+                - minimumWaitTimeDurationSeconds
+                type: object
+            type: object
           replicas:
             description: |-
               Number of desired pods. This is a pointer to distinguish between explicit
```
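The schema above is the generated CRD view of the new API fields. For orientation, here is a rough sketch of the Go types in the `zv1` API package that such a schema would typically be generated from; the field names follow how `getDrainingConfig` reads them, while the struct names, JSON tags, and kubebuilder markers are assumptions rather than the repository's actual source.

```go
// Sketch of the zv1 API types implied by the CRD schema. Field names match
// getDrainingConfig; struct names, tags, and markers are inferred, not copied.
package zv1

// ExperimentalSpec groups settings that may change in future releases.
type ExperimentalSpec struct {
	// Draining controls behaviour of the EDS while draining nodes.
	// +optional
	Draining *ElasticsearchDataSetDraining `json:"draining,omitempty"`
}

// ElasticsearchDataSetDraining parametrizes node draining.
type ElasticsearchDataSetDraining struct {
	// MaxRetries specifies the maximum number of attempts to drain a node.
	// +kubebuilder:default=999
	// +kubebuilder:validation:Minimum=0
	MaxRetries int32 `json:"maxRetries"`

	// MaximumWaitTimeDurationSeconds specifies the maximum wait time in
	// seconds between retry attempts after a failed node drain.
	// +kubebuilder:default=30
	// +kubebuilder:validation:Minimum=0
	MaximumWaitTimeDurationSeconds int64 `json:"maximumWaitTimeDurationSeconds"`

	// MinimumWaitTimeDurationSeconds specifies the minimum wait time in
	// seconds between retry attempts after a failed node drain.
	// +kubebuilder:default=10
	// +kubebuilder:validation:Minimum=0
	MinimumWaitTimeDurationSeconds int64 `json:"minimumWaitTimeDurationSeconds"`
}
```

The int32/int64 formats and the 999/30/10 defaults mirror the schema above.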

operator/elasticsearch.go

+27 -2
```diff
@@ -56,6 +56,13 @@ type operatingEntry struct {
     logger *log.Entry
 }

+// DrainingConfig specifies the configuration settings for the behavior of draining Elasticsearch nodes.
+type DrainingConfig struct {
+    MaxRetries      int
+    MinimumWaitTime time.Duration
+    MaximumWaitTime time.Duration
+}
+
 // NewElasticsearchOperator initializes a new ElasticsearchDataSet operator instance.
 func NewElasticsearchOperator(
     client *clientset.Clientset,
@@ -241,10 +248,10 @@ func (o *ElasticsearchOperator) runAutoscaler(ctx context.Context) {
     for _, es := range resources {
         if es.ElasticsearchDataSet.Spec.Scaling != nil && es.ElasticsearchDataSet.Spec.Scaling.Enabled {
             endpoint := o.getElasticsearchEndpoint(es.ElasticsearchDataSet)
-
             client := &ESClient{
                 Endpoint:             endpoint,
                 excludeSystemIndices: es.ElasticsearchDataSet.Spec.ExcludeSystemIndices,
+                DrainingConfig:       o.getDrainingConfig(es.ElasticsearchDataSet),
             }

             err := o.scaleEDS(ctx, es.ElasticsearchDataSet, es, client)
@@ -676,7 +683,8 @@ func (o *ElasticsearchOperator) operateEDS(eds *zv1.ElasticsearchDataSet, delete

     // TODO: abstract this
     client := &ESClient{
-        Endpoint: endpoint,
+        Endpoint:       endpoint,
+        DrainingConfig: o.getDrainingConfig(eds),
     }

     operator := &Operator{
@@ -731,6 +739,23 @@ func (o *ElasticsearchOperator) getElasticsearchEndpoint(eds *zv1.ElasticsearchD
     }
 }

+// getDrainingConfig returns the draining configuration that controls how nodes are drained.
+func (o *ElasticsearchOperator) getDrainingConfig(eds *zv1.ElasticsearchDataSet) *DrainingConfig {
+    // Fall back to the default configuration if no draining configuration is specified.
+    if eds.Spec.Experimental == nil || eds.Spec.Experimental.Draining == nil {
+        return &DrainingConfig{
+            MaxRetries:      999,
+            MinimumWaitTime: 10 * time.Second,
+            MaximumWaitTime: 30 * time.Second,
+        }
+    }
+    return &DrainingConfig{
+        MaxRetries:      int(eds.Spec.Experimental.Draining.MaxRetries),
+        MinimumWaitTime: time.Duration(eds.Spec.Experimental.Draining.MinimumWaitTimeDurationSeconds) * time.Second,
+        MaximumWaitTime: time.Duration(eds.Spec.Experimental.Draining.MaximumWaitTimeDurationSeconds) * time.Second,
+    }
+}
+
 type ESResource struct {
     ElasticsearchDataSet *zv1.ElasticsearchDataSet
     StatefulSet          *appsv1.StatefulSet
```
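Because `getDrainingConfig` only reads the spec, its fallback behaviour is easy to pin down in a test. Below is a minimal sketch of such a check, assuming the zv1 struct names from the sketch after the CRD section and a pkg/apis-style import path; the repository's real type names and test layout may differ.

```go
package operator

import (
	"testing"
	"time"

	// Import path assumed for illustration.
	zv1 "github.com/zalando-incubator/es-operator/pkg/apis/zalando.org/v1"
)

func TestGetDrainingConfig(t *testing.T) {
	o := &ElasticsearchOperator{}

	// Without an experimental block the built-in defaults must be used.
	cfg := o.getDrainingConfig(&zv1.ElasticsearchDataSet{})
	if cfg.MaxRetries != 999 || cfg.MinimumWaitTime != 10*time.Second || cfg.MaximumWaitTime != 30*time.Second {
		t.Fatalf("unexpected defaults: %+v", cfg)
	}

	// With an explicit draining block the spec values win over the defaults.
	eds := &zv1.ElasticsearchDataSet{}
	eds.Spec.Experimental = &zv1.ExperimentalSpec{ // struct names assumed, see sketch above
		Draining: &zv1.ElasticsearchDataSetDraining{
			MaxRetries:                     3,
			MinimumWaitTimeDurationSeconds: 5,
			MaximumWaitTimeDurationSeconds: 15,
		},
	}
	cfg = o.getDrainingConfig(eds)
	if cfg.MaxRetries != 3 || cfg.MinimumWaitTime != 5*time.Second || cfg.MaximumWaitTime != 15*time.Second {
		t.Fatalf("unexpected config: %+v", cfg)
	}
}
```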
