Skip to content

Commit

Permalink
Block deleting compute services with in-progress migrations
Browse files Browse the repository at this point in the history
This builds on I0bd63b655ad3d3d39af8d15c781ce0a45efc8e3a
which made DELETE /os-services/{service_id} fail with a 409
response if the host has instances on it. This change checks
for in-progress migrations involving the nodes on the host,
either as the source or destination nodes, and returns a 409
error response if any are found.

Failling to do this can lead to orphaned resource providers
in placement and also failing to properly confirm or revert
a pending resize or cold migration.

A release note is included for the (justified) behavior
change in the API. A new microversion should not be required
for this since admins should not have to opt out of broken
behavior.

Change-Id: I70e06c607045a1c0842f13069e51fef438012a9c
Closes-Bug: #1852610
  • Loading branch information
mriedem committed Nov 14, 2019
1 parent f7dde60 commit 92fed02
Show file tree
Hide file tree
Showing 5 changed files with 109 additions and 26 deletions.
6 changes: 6 additions & 0 deletions api-ref/source/os-services.inc
Original file line number Diff line number Diff line change
Expand Up @@ -349,6 +349,12 @@ Attempts to delete a ``nova-compute`` service which is still hosting instances
will result in a 409 HTTPConflict response. The instances will need to be
migrated or deleted before a compute service can be deleted.

Similarly, attempts to delete a ``nova-compute`` service which is involved in
in-progress migrations will result in a 409 HTTPConflict response. The
migrations will need to be completed, for example confirming or reverting a
resize, or the instances will need to be deleted before the compute service can
be deleted.

.. important:: Be sure to stop the actual ``nova-compute`` process on the
physical host *before* deleting the service with this API.
Failing to do so can lead to the running service re-creating
Expand Down
42 changes: 40 additions & 2 deletions nova/api/openstack/compute/services.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,16 @@ def delete(self, req, id):
'is hosting instances. Migrate or '
'delete the instances first.'))

# Similarly, check to see if the are any in-progress migrations
# involving this host because if there are we need to block the
# service delete since we could orphan resource providers and
# break the ability to do things like confirm/revert instances
# in VERIFY_RESIZE status.
compute_nodes = objects.ComputeNodeList.get_all_by_host(
context, service.host)
self._assert_no_in_progress_migrations(
context, id, compute_nodes)

aggrs = self.aggregate_api.get_aggregates_by_host(context,
service.host)
for ag in aggrs:
Expand All @@ -274,8 +284,6 @@ def delete(self, req, id):
# placement for the compute nodes managed by this service;
# remember that an ironic compute service can manage multiple
# nodes
compute_nodes = objects.ComputeNodeList.get_all_by_host(
context, service.host)
for compute_node in compute_nodes:
try:
self.placementclient.delete_resource_provider(
Expand Down Expand Up @@ -303,6 +311,36 @@ def delete(self, req, id):
explanation = _("Service id %s refers to multiple services.") % id
raise webob.exc.HTTPBadRequest(explanation=explanation)

@staticmethod
def _assert_no_in_progress_migrations(context, service_id, compute_nodes):
"""Ensures there are no in-progress migrations on the given nodes.
:param context: nova auth RequestContext
:param service_id: id of the Service being deleted
:param compute_nodes: ComputeNodeList of nodes on a compute service
:raises: HTTPConflict if there are any in-progress migrations on the
nodes
"""
for cn in compute_nodes:
migrations = (
objects.MigrationList.get_in_progress_by_host_and_node(
context, cn.host, cn.hypervisor_hostname))
if migrations:
# Log the migrations for the operator and then raise
# a 409 error.
LOG.info('Unable to delete compute service with id %s '
'for host %s. There are %i in-progress '
'migrations involving the host. Migrations '
'(uuid:status): %s',
service_id, cn.host, len(migrations),
','.join(['%s:%s' % (mig.uuid, mig.status)
for mig in migrations]))
raise webob.exc.HTTPConflict(
explanation=_(
'Unable to delete compute service that has '
'in-progress migrations. Complete the '
'migrations or delete the instances first.'))

@validation.query_schema(services.index_query_schema_275, '2.75')
@validation.query_schema(services.index_query_schema, '2.0', '2.74')
@wsgi.expected_errors(())
Expand Down
12 changes: 12 additions & 0 deletions nova/tests/functional/integrated_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -955,6 +955,18 @@ def _confirm_resize(self, server):
'compute_confirm_resize', 'success')
return server

def _revert_resize(self, server):
self.api.post_server_action(server['id'], {'revertResize': None})
server = self._wait_for_state_change(self.api, server, 'ACTIVE')
self._wait_for_migration_status(server, ['reverted'])
# Note that the migration status is changed to "reverted" in the
# dest host revert_resize method but the allocations are cleaned up
# in the source host finish_revert_resize method so we need to wait
# for the finish_revert_resize method to complete.
fake_notifier.wait_for_versioned_notifications(
'instance.resize_revert.end')
return server

def get_unused_flavor_name_id(self):
flavors = self.api.get_flavors()
flavor_names = list()
Expand Down
65 changes: 41 additions & 24 deletions nova/tests/functional/wsgi/test_services.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,21 +194,29 @@ def test_migrate_confirm_after_deleted_source_compute(self):
# Delete the source compute service.
service = self.admin_api.get_services(
binary='nova-compute', host='host1')[0]
self.admin_api.api_delete('/os-services/%s' % service['id'])
# FIXME(mriedem): This is bug 1852610 where the compute service is
# deleted but the resource provider is not because there are still
# migration-based allocations against the source node provider.
# We expect the delete request to fail with a 409 error because of the
# instance in VERIFY_RESIZE status even though that instance is marked
# as being on host2 now.
ex = self.assertRaises(api_client.OpenStackApiException,
self.admin_api.api_delete,
'/os-services/%s' % service['id'])
self.assertEqual(409, ex.response.status_code)
self.assertIn('Unable to delete compute service that has in-progress '
'migrations', six.text_type(ex))
self.assertIn('There are 1 in-progress migrations involving the host',
self.stdlog.logger.output)
# The provider is still around because we did not delete the service.
resp = self.placement_api.get('/resource_providers/%s' % host1_rp_uuid)
self.assertEqual(200, resp.status)
self.assertFlavorMatchesUsage(host1_rp_uuid, flavor)
# Now try to confirm the migration.
# FIXME(mriedem): This will fail until bug 1852610 is fixed and the
# source compute service delete is blocked while there is an
# in-progress migration involving the node.
self.assertNotIn('ComputeHostNotFound', self.stdlog.logger.output)
self.api.post_server_action(server['id'], {'confirmResize': None})
self._wait_for_state_change(self.api, server, 'ERROR')
self.assertIn('ComputeHostNotFound', self.stdlog.logger.output)
self._confirm_resize(server)
# Delete the host1 service since the migration is confirmed and the
# server is on host2.
self.admin_api.api_delete('/os-services/%s' % service['id'])
# The host1 resource provider should be gone.
resp = self.placement_api.get('/resource_providers/%s' % host1_rp_uuid)
self.assertEqual(404, resp.status)

def test_resize_revert_after_deleted_source_compute(self):
"""Tests a scenario where a server is resized and while in
Expand All @@ -231,25 +239,34 @@ def test_resize_revert_after_deleted_source_compute(self):
# Delete the source compute service.
service = self.admin_api.get_services(
binary='nova-compute', host='host1')[0]
self.admin_api.api_delete('/os-services/%s' % service['id'])
# FIXME(mriedem): This is bug 1852610 where the compute service is
# deleted but the resource provider is not because there are still
# migration-based allocations against the source node provider.
# We expect the delete request to fail with a 409 error because of the
# instance in VERIFY_RESIZE status even though that instance is marked
# as being on host2 now.
ex = self.assertRaises(api_client.OpenStackApiException,
self.admin_api.api_delete,
'/os-services/%s' % service['id'])
self.assertEqual(409, ex.response.status_code)
self.assertIn('Unable to delete compute service that has in-progress '
'migrations', six.text_type(ex))
self.assertIn('There are 1 in-progress migrations involving the host',
self.stdlog.logger.output)
# The provider is still around because we did not delete the service.
resp = self.placement_api.get('/resource_providers/%s' % host1_rp_uuid)
self.assertEqual(200, resp.status)
self.assertFlavorMatchesUsage(host1_rp_uuid, flavor1)
# Now try to revert the resize.
# NOTE(mriedem): This actually works because the drop_move_claim
# happens in revert_resize on the dest host which still has its
# ComputeNode record. The migration-based allocations are reverted
# so the instance holds the allocations for the source provider and
# the allocations against the dest provider are dropped.
self.api.post_server_action(server['id'], {'revertResize': None})
self._wait_for_state_change(self.api, server, 'ACTIVE')
self.assertNotIn('ComputeHostNotFound', self.stdlog.logger.output)
# Now revert the resize.
self._revert_resize(server)
self.assertFlavorMatchesUsage(host1_rp_uuid, flavor1)
zero_flavor = {'vcpus': 0, 'ram': 0, 'disk': 0, 'extra_specs': {}}
self.assertFlavorMatchesUsage(host2_rp_uuid, zero_flavor)
# Delete the host2 service since the migration is reverted and the
# server is on host1 again.
service2 = self.admin_api.get_services(
binary='nova-compute', host='host2')[0]
self.admin_api.api_delete('/os-services/%s' % service2['id'])
# The host2 resource provider should be gone.
resp = self.placement_api.get('/resource_providers/%s' % host2_rp_uuid)
self.assertEqual(404, resp.status)


class ComputeStatusFilterTest(integrated_helpers.ProviderUsageBaseTestCase):
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
---
fixes:
- |
The ``DELETE /os-services/{service_id}`` compute API will now return a
``409 HTTPConflict`` response when trying to delete a ``nova-compute``
service which is involved in in-progress migrations. This is because doing
so would not only orphan the compute node resource provider in the
placement service on which those instances have resource allocations but
can also break the ability to confirm/revert a pending resize properly.
See https://bugs.launchpad.net/nova/+bug/1852610 for more details.

0 comments on commit 92fed02

Please sign in to comment.