Skip to content

Commit c956a88

Browse files
Zuul, openstack-gerrit
authored and committed
Merge "Block deleting compute services with in-progress migrations"
2 parents 236d56e + 92fed02 commit c956a88

File tree

5 files changed

+109
-26
lines changed

5 files changed

+109
-26
lines changed

api-ref/source/os-services.inc

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -349,6 +349,12 @@ Attempts to delete a ``nova-compute`` service which is still hosting instances
349349
will result in a 409 HTTPConflict response. The instances will need to be
350350
migrated or deleted before a compute service can be deleted.
351351

352+
Similarly, attempts to delete a ``nova-compute`` service which is involved in
353+
in-progress migrations will result in a 409 HTTPConflict response. The
354+
migrations will need to be completed, for example confirming or reverting a
355+
resize, or the instances will need to be deleted before the compute service can
356+
be deleted.
357+
352358
.. important:: Be sure to stop the actual ``nova-compute`` process on the
353359
physical host *before* deleting the service with this API.
354360
Failing to do so can lead to the running service re-creating

nova/api/openstack/compute/services.py

Lines changed: 40 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -264,6 +264,16 @@ def delete(self, req, id):
264264
'is hosting instances. Migrate or '
265265
'delete the instances first.'))
266266

267+
# Similarly, check to see if there are any in-progress migrations
268+
# involving this host because if there are we need to block the
269+
# service delete since we could orphan resource providers and
270+
# break the ability to do things like confirm/revert instances
271+
# in VERIFY_RESIZE status.
272+
compute_nodes = objects.ComputeNodeList.get_all_by_host(
273+
context, service.host)
274+
self._assert_no_in_progress_migrations(
275+
context, id, compute_nodes)
276+
267277
aggrs = self.aggregate_api.get_aggregates_by_host(context,
268278
service.host)
269279
for ag in aggrs:
@@ -274,8 +284,6 @@ def delete(self, req, id):
274284
# placement for the compute nodes managed by this service;
275285
# remember that an ironic compute service can manage multiple
276286
# nodes
277-
compute_nodes = objects.ComputeNodeList.get_all_by_host(
278-
context, service.host)
279287
for compute_node in compute_nodes:
280288
try:
281289
self.placementclient.delete_resource_provider(
@@ -303,6 +311,36 @@ def delete(self, req, id):
303311
explanation = _("Service id %s refers to multiple services.") % id
304312
raise webob.exc.HTTPBadRequest(explanation=explanation)
305313

314+
@staticmethod
315+
def _assert_no_in_progress_migrations(context, service_id, compute_nodes):
316+
"""Ensures there are no in-progress migrations on the given nodes.
317+
318+
:param context: nova auth RequestContext
319+
:param service_id: id of the Service being deleted
320+
:param compute_nodes: ComputeNodeList of nodes on a compute service
321+
:raises: HTTPConflict if there are any in-progress migrations on the
322+
nodes
323+
"""
324+
for cn in compute_nodes:
325+
migrations = (
326+
objects.MigrationList.get_in_progress_by_host_and_node(
327+
context, cn.host, cn.hypervisor_hostname))
328+
if migrations:
329+
# Log the migrations for the operator and then raise
330+
# a 409 error.
331+
LOG.info('Unable to delete compute service with id %s '
332+
'for host %s. There are %i in-progress '
333+
'migrations involving the host. Migrations '
334+
'(uuid:status): %s',
335+
service_id, cn.host, len(migrations),
336+
','.join(['%s:%s' % (mig.uuid, mig.status)
337+
for mig in migrations]))
338+
raise webob.exc.HTTPConflict(
339+
explanation=_(
340+
'Unable to delete compute service that has '
341+
'in-progress migrations. Complete the '
342+
'migrations or delete the instances first.'))
343+
306344
@validation.query_schema(services.index_query_schema_275, '2.75')
307345
@validation.query_schema(services.index_query_schema, '2.0', '2.74')
308346
@wsgi.expected_errors(())

nova/tests/functional/integrated_helpers.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -962,6 +962,18 @@ def _confirm_resize(self, server):
962962
'compute_confirm_resize', 'success')
963963
return server
964964

965+
def _revert_resize(self, server):
966+
self.api.post_server_action(server['id'], {'revertResize': None})
967+
server = self._wait_for_state_change(self.api, server, 'ACTIVE')
968+
self._wait_for_migration_status(server, ['reverted'])
969+
# Note that the migration status is changed to "reverted" in the
970+
# dest host revert_resize method but the allocations are cleaned up
971+
# in the source host finish_revert_resize method so we need to wait
972+
# for the finish_revert_resize method to complete.
973+
fake_notifier.wait_for_versioned_notifications(
974+
'instance.resize_revert.end')
975+
return server
976+
965977
def get_unused_flavor_name_id(self):
966978
flavors = self.api.get_flavors()
967979
flavor_names = list()

nova/tests/functional/wsgi/test_services.py

Lines changed: 41 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -194,21 +194,29 @@ def test_migrate_confirm_after_deleted_source_compute(self):
194194
# Delete the source compute service.
195195
service = self.admin_api.get_services(
196196
binary='nova-compute', host='host1')[0]
197-
self.admin_api.api_delete('/os-services/%s' % service['id'])
198-
# FIXME(mriedem): This is bug 1852610 where the compute service is
199-
# deleted but the resource provider is not because there are still
200-
# migration-based allocations against the source node provider.
197+
# We expect the delete request to fail with a 409 error because of the
198+
# instance in VERIFY_RESIZE status even though that instance is marked
199+
# as being on host2 now.
200+
ex = self.assertRaises(api_client.OpenStackApiException,
201+
self.admin_api.api_delete,
202+
'/os-services/%s' % service['id'])
203+
self.assertEqual(409, ex.response.status_code)
204+
self.assertIn('Unable to delete compute service that has in-progress '
205+
'migrations', six.text_type(ex))
206+
self.assertIn('There are 1 in-progress migrations involving the host',
207+
self.stdlog.logger.output)
208+
# The provider is still around because we did not delete the service.
201209
resp = self.placement_api.get('/resource_providers/%s' % host1_rp_uuid)
202210
self.assertEqual(200, resp.status)
203211
self.assertFlavorMatchesUsage(host1_rp_uuid, flavor)
204212
# Now try to confirm the migration.
205-
# FIXME(mriedem): This will fail until bug 1852610 is fixed and the
206-
# source compute service delete is blocked while there is an
207-
# in-progress migration involving the node.
208-
self.assertNotIn('ComputeHostNotFound', self.stdlog.logger.output)
209-
self.api.post_server_action(server['id'], {'confirmResize': None})
210-
self._wait_for_state_change(self.api, server, 'ERROR')
211-
self.assertIn('ComputeHostNotFound', self.stdlog.logger.output)
213+
self._confirm_resize(server)
214+
# Delete the host1 service since the migration is confirmed and the
215+
# server is on host2.
216+
self.admin_api.api_delete('/os-services/%s' % service['id'])
217+
# The host1 resource provider should be gone.
218+
resp = self.placement_api.get('/resource_providers/%s' % host1_rp_uuid)
219+
self.assertEqual(404, resp.status)
212220

213221
def test_resize_revert_after_deleted_source_compute(self):
214222
"""Tests a scenario where a server is resized and while in
@@ -231,25 +239,34 @@ def test_resize_revert_after_deleted_source_compute(self):
231239
# Delete the source compute service.
232240
service = self.admin_api.get_services(
233241
binary='nova-compute', host='host1')[0]
234-
self.admin_api.api_delete('/os-services/%s' % service['id'])
235-
# FIXME(mriedem): This is bug 1852610 where the compute service is
236-
# deleted but the resource provider is not because there are still
237-
# migration-based allocations against the source node provider.
242+
# We expect the delete request to fail with a 409 error because of the
243+
# instance in VERIFY_RESIZE status even though that instance is marked
244+
# as being on host2 now.
245+
ex = self.assertRaises(api_client.OpenStackApiException,
246+
self.admin_api.api_delete,
247+
'/os-services/%s' % service['id'])
248+
self.assertEqual(409, ex.response.status_code)
249+
self.assertIn('Unable to delete compute service that has in-progress '
250+
'migrations', six.text_type(ex))
251+
self.assertIn('There are 1 in-progress migrations involving the host',
252+
self.stdlog.logger.output)
253+
# The provider is still around because we did not delete the service.
238254
resp = self.placement_api.get('/resource_providers/%s' % host1_rp_uuid)
239255
self.assertEqual(200, resp.status)
240256
self.assertFlavorMatchesUsage(host1_rp_uuid, flavor1)
241-
# Now try to revert the resize.
242-
# NOTE(mriedem): This actually works because the drop_move_claim
243-
# happens in revert_resize on the dest host which still has its
244-
# ComputeNode record. The migration-based allocations are reverted
245-
# so the instance holds the allocations for the source provider and
246-
# the allocations against the dest provider are dropped.
247-
self.api.post_server_action(server['id'], {'revertResize': None})
248-
self._wait_for_state_change(self.api, server, 'ACTIVE')
249-
self.assertNotIn('ComputeHostNotFound', self.stdlog.logger.output)
257+
# Now revert the resize.
258+
self._revert_resize(server)
250259
self.assertFlavorMatchesUsage(host1_rp_uuid, flavor1)
251260
zero_flavor = {'vcpus': 0, 'ram': 0, 'disk': 0, 'extra_specs': {}}
252261
self.assertFlavorMatchesUsage(host2_rp_uuid, zero_flavor)
262+
# Delete the host2 service since the migration is reverted and the
263+
# server is on host1 again.
264+
service2 = self.admin_api.get_services(
265+
binary='nova-compute', host='host2')[0]
266+
self.admin_api.api_delete('/os-services/%s' % service2['id'])
267+
# The host2 resource provider should be gone.
268+
resp = self.placement_api.get('/resource_providers/%s' % host2_rp_uuid)
269+
self.assertEqual(404, resp.status)
253270

254271

255272
class ComputeStatusFilterTest(integrated_helpers.ProviderUsageBaseTestCase):
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
---
2+
fixes:
3+
- |
4+
The ``DELETE /os-services/{service_id}`` compute API will now return a
5+
``409 HTTPConflict`` response when trying to delete a ``nova-compute``
6+
service which is involved in in-progress migrations. This is because doing
7+
so would not only orphan the compute node resource provider in the
8+
placement service on which those instances have resource allocations but
9+
can also break the ability to confirm/revert a pending resize properly.
10+
See https://bugs.launchpad.net/nova/+bug/1852610 for more details.

0 commit comments

Comments
 (0)