
Commit 6e76708

Automatically detect deleted resources
While working on some Gnocchi optimizations (#1307), we noticed that some deleted/removed resources do not have the "ended_at" field set to a datetime. This can cause slowness over time, as more and more "zombie" resources accumulate, which has a direct impact on the MySQL queries executed by the aggregates API. This patch introduces a new parameter called `metric_inactive_after`, which defines how long a metric can go without receiving new datapoints before we consider it inactive. When all metrics of a resource are inactive, we mark the resource as removed.
1 parent 57b9693 commit 6e76708

10 files changed (+183, -10 lines)
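
For intuition, the inactivity rule described in the commit message reduces to a single cutoff comparison per metric. A minimal, standalone sketch in plain Python (the helper name and structure below are illustrative only, not part of the patch):

import datetime


def is_metric_inactive(last_measure_timestamp, metric_inactive_after,
                       now=None):
    # A metric counts as inactive when its last measure is older than
    # now - metric_inactive_after seconds; 0 (the option's default in
    # this commit) keeps the check disabled.
    if not metric_inactive_after or metric_inactive_after <= 0:
        return False
    now = now or datetime.datetime.utcnow()
    cutoff = now - datetime.timedelta(seconds=metric_inactive_after)
    return last_measure_timestamp < cutoff


# Example: last push two hours ago with a one-hour threshold -> inactive.
two_hours_ago = datetime.datetime.utcnow() - datetime.timedelta(hours=2)
print(is_metric_inactive(two_hours_ago, metric_inactive_after=3600))  # True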

gnocchi/chef.py

Lines changed: 75 additions & 0 deletions
@@ -18,9 +18,11 @@

 import daiquiri
 import random
+import datetime

 from gnocchi import carbonara
 from gnocchi import indexer
+from gnocchi import utils

 LOG = daiquiri.getLogger(__name__)

@@ -51,6 +53,79 @@ def __init__(self, coord, incoming, index, storage):
         self.index = index
         self.storage = storage

+    def resource_ended_at_normalization(self, metric_inactive_after):
+        """Marks resources as ended when needed.
+
+        This method checks all metrics that have not received new
+        datapoints after a given period, defined by the
+        'metric_inactive_after' parameter. If all metrics of a resource
+        are inactive, we mark its ended_at field with a timestamp;
+        therefore, we consider that the resource has ceased existing.
+
+        This process only handles metrics that are considered inactive,
+        according to the `metric_inactive_after` parameter. Therefore, we
+        do not need to lock these metrics while processing, as they are
+        inactive, and chances are that they will not receive measures anymore.
+        """
+
+        momment_now = utils.utcnow()
+        momment = momment_now - datetime.timedelta(
+            seconds=metric_inactive_after)
+
+        inactive_metrics = self.index.list_metrics(
+            attribute_filter={"<": {
+                "last_measure_timestamp": momment}},
+            resource_policy_filter={"==": {"ended_at": None}}
+        )
+
+        LOG.debug("Inactive metrics found for processing: [%s].",
+                  inactive_metrics)
+
+        metrics_by_resource_id = {}
+        for metric in inactive_metrics:
+            resource_id = metric.resource_id
+            if metrics_by_resource_id.get(resource_id) is None:
+                metrics_by_resource_id[resource_id] = []
+
+            metrics_by_resource_id[resource_id].append(metric)
+
+        for resource_id in metrics_by_resource_id.keys():
+            if resource_id is None:
+                LOG.debug("We do not need to process inactive metrics that "
+                          "do not have a resource. Therefore, these metrics "
+                          "[%s] will be considered inactive, but there is "
+                          "nothing else we can do in this process.",
+                          metrics_by_resource_id[resource_id])
+                continue
+            resource = self.index.get_resource(
+                "generic", resource_id, with_metrics=True)
+            resource_metrics = resource.metrics
+            resource_inactive_metrics = metrics_by_resource_id.get(resource_id)
+
+            all_metrics_are_inactive = True
+            for m in resource_metrics:
+                if m not in resource_inactive_metrics:
+                    all_metrics_are_inactive = False
+                    LOG.debug("Not all metrics of resource [%s] are inactive. "
+                              "Metric [%s] is not inactive. The inactive "
+                              "metrics are [%s].",
+                              resource, m, resource_inactive_metrics)
+                    break
+
+            if all_metrics_are_inactive:
+                LOG.info("All metrics [%s] of resource [%s] are inactive. "
+                         "Therefore, we will mark it as finished with an "
+                         "ended_at timestamp.", resource_metrics, resource)
+                if resource.ended_at is not None:
+                    LOG.debug(
+                        "Resource [%s] already has an ended_at value.", resource)
+                else:
+                    LOG.info("Marking ended_at timestamp for resource "
+                             "[%s] because all of its metrics are inactive.",
+                             resource)
+                    self.index.update_resource(
+                        "generic", resource_id, ended_at=momment_now)
+
     def clean_raw_data_inactive_metrics(self):
         """Cleans metrics raw data if they are inactive.
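
As a reading aid for the hunk above, here is a condensed, standalone restatement of the grouping it performs: collect inactive metrics per resource, then flag resources whose entire metric set is inactive. The helper callables are hypothetical stand-ins, not Gnocchi's indexer API:

import collections


def resources_to_end(inactive_metrics, get_resource_metric_ids):
    # inactive_metrics: iterable of (metric_id, resource_id) pairs.
    # get_resource_metric_ids(resource_id): every metric id of the resource.
    by_resource = collections.defaultdict(set)
    for metric_id, resource_id in inactive_metrics:
        if resource_id is not None:  # orphan metrics are skipped, as above
            by_resource[resource_id].add(metric_id)

    for resource_id, inactive_ids in by_resource.items():
        all_ids = set(get_resource_metric_ids(resource_id))
        # Only when every metric of the resource is inactive do we treat
        # the resource itself as ended.
        if all_ids and all_ids <= inactive_ids:
            yield resource_id


# Example: resource "r1" has metrics {"m1", "m2"}, and both are inactive.
inactive = [("m1", "r1"), ("m2", "r1"), ("m3", None)]
print(list(resources_to_end(inactive, lambda rid: {"m1", "m2"})))  # ['r1']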

gnocchi/cli/metricd.py

Lines changed: 11 additions & 0 deletions
@@ -278,6 +278,17 @@ def _run_job(self):
         LOG.debug("Finished the cleaning of raw data points for metrics that "
                   "are no longer receiving measures.")

+        if (self.conf.metricd.metric_inactive_after and
+                self.conf.metricd.metric_inactive_after > 0):
+            LOG.debug("Starting resource ended_at field normalization.")
+            self.chef.resource_ended_at_normalization(
+                self.conf.metricd.metric_inactive_after)
+            LOG.debug("Finished resource ended_at field normalization.")
+        else:
+            LOG.debug("Resource ended_at field normalization is not "
+                      "activated. See the 'metric_inactive_after' parameter "
+                      "if you wish to activate it.")
+

 class MetricdServiceManager(cotyledon.ServiceManager):
     def __init__(self, conf):

gnocchi/indexer/__init__.py

Lines changed: 5 additions & 1 deletion
@@ -446,7 +446,11 @@ def update_backwindow_changed_for_metrics_archive_policy(
         raise exceptions.NotImplementedError

     @staticmethod
-    def update_needs_raw_data_truncation(metric_id):
+    def update_needs_raw_data_truncation(metric_id, value):
+        raise exceptions.NotImplementedError
+
+    @staticmethod
+    def update_last_measure_timestmap(metric_id):
         raise exceptions.NotImplementedError

     @staticmethod
gnocchi/indexer/alembic/versions/18fff4509e3e_create_column_for_truncate_inactive_metrics_process.py

Lines changed: 1 addition & 3 deletions
@@ -13,17 +13,15 @@
 # under the License.
 #

-"""create metric truncation status column
+"""Create metric truncation status column

 Revision ID: 18fff4509e3e
 Revises: 04eba72e4f90
 Create Date: 2024-04-24 09:16:00

 """
-import datetime

 from alembic import op
-from sqlalchemy.sql import func

 import sqlalchemy

New file: an Alembic migration under gnocchi/indexer/alembic/versions/ (revision f89ed2e3c2ec)

Lines changed: 39 additions & 0 deletions

@@ -0,0 +1,39 @@
+# Copyright 2015 OpenStack Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+#
+
+"""Create last measure push timestamp column
+Revision ID: f89ed2e3c2ec
+Revises: 18fff4509e3e
+Create Date: 2024-04-24 09:16:00
+"""
+
+from alembic import op
+
+import sqlalchemy
+
+from sqlalchemy.sql import func
+
+# revision identifiers, used by Alembic.
+revision = 'f89ed2e3c2ec'
+down_revision = '18fff4509e3e'
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    op.add_column(
+        "metric", sqlalchemy.Column(
+            "last_measure_timestamp", sqlalchemy.DateTime,
+            nullable=False, server_default=func.current_timestamp()))

gnocchi/indexer/sqlalchemy.py

Lines changed: 7 additions & 0 deletions
@@ -1403,6 +1403,13 @@ def update_needs_raw_data_truncation(self, metrid_id, value=False):
             if session.execute(stmt).rowcount == 0:
                 raise indexer.NoSuchMetric(metrid_id)

+    def update_last_measure_timestmap(self, metrid_id):
+        with self.facade.writer() as session:
+            stmt = update(Metric).filter(Metric.id == metrid_id).values(
+                last_measure_timestamp=datetime.datetime.utcnow())
+            if session.execute(stmt).rowcount == 0:
+                raise indexer.NoSuchMetric(metrid_id)
+
     def update_backwindow_changed_for_metrics_archive_policy(
             self, archive_policy_name):
         with self.facade.writer() as session:

gnocchi/indexer/sqlalchemy_base.py

Lines changed: 13 additions & 2 deletions
@@ -19,6 +19,7 @@
 import sqlalchemy
 from sqlalchemy.ext import declarative
 from sqlalchemy.orm import declarative_base
+from sqlalchemy.sql import func

 import sqlalchemy_utils

@@ -113,6 +114,14 @@ class Metric(Base, GnocchiBase, indexer.Metric):
                                    nullable=False, default=True,
                                    server_default=sqlalchemy.sql.true())

+    # Timestamp that represents when the last measure push was received for the
+    # given metric. This allows us to identify when a metric ceased receiving
+    # measurements; thus, if all metrics of a resource are in this situation,
+    # chances are that the resource ceased existing in the backend.
+    last_measure_timestamp = sqlalchemy.Column(
+        "last_measure_timestamp", sqlalchemy.DateTime,
+        nullable=False, server_default=func.current_timestamp())
+
     def jsonify(self):
         d = {
             "id": self.id,

@@ -256,7 +265,8 @@ def type(cls):
     creator = sqlalchemy.Column(sqlalchemy.String(255))
     started_at = sqlalchemy.Column(types.TimestampUTC, nullable=False,
                                    default=lambda: utils.utcnow())
-    revision_start = sqlalchemy.Column(types.TimestampUTC, nullable=False,
+    revision_start = sqlalchemy.Column(types.TimestampUTC,
+                                       nullable=False,
                                        default=lambda: utils.utcnow())
     ended_at = sqlalchemy.Column(types.TimestampUTC)
     user_id = sqlalchemy.Column(sqlalchemy.String(255))

@@ -298,7 +308,8 @@ class ResourceHistory(ResourceMixin, Base, GnocchiBase):
                            ondelete="CASCADE",
                            name="fk_rh_id_resource_id"),
                        nullable=False)
-    revision_end = sqlalchemy.Column(types.TimestampUTC, nullable=False,
+    revision_end = sqlalchemy.Column(types.TimestampUTC,
+                                     nullable=False,
                                      default=lambda: utils.utcnow())
     metrics = sqlalchemy.orm.relationship(
         Metric, primaryjoin="Metric.resource_id == ResourceHistory.id",
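
To see how the new column supports the indexer-side filter used in chef.py, here is a throwaway SQLAlchemy Core sketch (assuming SQLAlchemy 1.4+; a toy table definition, not Gnocchi's real model) that builds the same "older than the cutoff" predicate:

import datetime

import sqlalchemy
from sqlalchemy.sql import func

metadata = sqlalchemy.MetaData()
metric = sqlalchemy.Table(
    "metric", metadata,
    sqlalchemy.Column("id", sqlalchemy.Integer, primary_key=True),
    sqlalchemy.Column("last_measure_timestamp", sqlalchemy.DateTime,
                      nullable=False,
                      server_default=func.current_timestamp()),
)

# Metrics whose last measure is older than a one-hour cutoff.
cutoff = datetime.datetime.utcnow() - datetime.timedelta(hours=1)
stmt = sqlalchemy.select(metric.c.id).where(
    metric.c.last_measure_timestamp < cutoff)
print(stmt)  # prints the rendered SELECT with the timestamp predicate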

gnocchi/opts.py

Lines changed: 11 additions & 3 deletions
@@ -57,7 +57,6 @@ def __getitem__(self, key):
 for opt in _INCOMING_OPTS:
     opt.default = '${storage.%s}' % opt.name

-
 API_OPTS = (
     cfg.HostAddressOpt('host',
                        default="0.0.0.0",

@@ -73,7 +72,7 @@ def __getitem__(self, key):
 but not chunked encoding (InfluxDB)
 * http-socket/socket: support chunked encoding, but require a upstream HTTP
   Server for HTTP/1.1, keepalive and HTTP protocol correctness.
-""")
+"""),
 )


@@ -172,7 +171,16 @@ def list_opts():
                    default=10000,
                    min=1,
                    help="Number of metrics that should be deleted "
-                        "simultaneously by one janitor.")
+                        "simultaneously by one janitor."),
+        cfg.IntOpt('metric_inactive_after',
+                   default=0,
+                   help="Number of seconds to wait before we consider a "
+                        "metric inactive. An inactive metric is a metric "
+                        "that has not received new measurements for a "
+                        "given period. If all metrics of a resource are "
+                        "inactive, we mark the resource with the "
+                        "'ended_at' timestamp. The default is 0 (zero), "
+                        "which means that we never execute this process.")
         )),
     ("api", (
         cfg.StrOpt('paste_config',
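
Since the option is registered under the metricd group, enabling the janitor is a matter of setting it in gnocchi.conf. A sketch with an illustrative one-day threshold (the 86400 value is an assumption; tune it to your ingestion cadence):

[metricd]
# Illustrative value: consider a metric inactive after 24 hours without
# new measures. The default of 0 keeps the ended_at normalization disabled.
metric_inactive_after = 86400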

gnocchi/storage/__init__.py

Lines changed: 20 additions & 0 deletions
@@ -688,6 +688,26 @@ def _map_compute_splits_operations(bound_timeserie):
         if metric.needs_raw_data_truncation:
             indexer_driver.update_needs_raw_data_truncation(metric.id)

+        # Mark when the metric receives its latest measures
+        indexer_driver.update_last_measure_timestmap(metric.id)
+
+        resource_id = metric.resource_id
+        if resource_id:
+            resource = indexer_driver.get_resource('generic', resource_id)
+            LOG.debug("Checking if resource [%s] of metric [%s] with "
+                      "resource ID [%s] needs to be 'undeleted.'",
+                      resource, metric.id, resource_id)
+            if resource.ended_at is not None:
+                LOG.info("Resource [%s] was marked with a timestamp for the "
+                         "'ended_at' field. However, it received a "
+                         "measurement for metric [%s]. Therefore, we undelete "
+                         "it.", resource, metric)
+                indexer_driver.update_resource(
+                    "generic", resource_id, ended_at=None)
+        else:
+            LOG.debug("Metric [%s] does not have a resource "
+                      "assigned to it.", metric)
+
         with self.statistics.time("splits delete"):
             self._delete_metric_splits(splits_to_delete)
             self.statistics["splits delete"] += len(splits_to_delete)

run-upgrade-tests.sh

Lines changed: 1 addition & 1 deletion
@@ -107,7 +107,7 @@ export GNOCCHI_USER=$GNOCCHI_USER_ID
 # needs to be released. Otherwise, the logs stop to be writen, and the
 # execution of the code is "frozen", due to the lack of buffer in the
 # process output. To work around that, we can read the buffer, and dump it
-# into a lof file. Then, we can cat the log file content at the end of the
+# into a log file. Then, we can cat the log file content at the end of the
 # process.
 UWSGI_LOG_FILE=/tmp/uwsgi-new-version.log
 METRICD_LOG_FILE=/tmp/gnocchi-metricd-new-version.log

0 commit comments
