Skip to content

Commit

Permalink
Merge "Block deleting compute services with in-progress migrations"
Browse files Browse the repository at this point in the history
  • Loading branch information
Zuul authored and openstack-gerrit committed Nov 16, 2019
2 parents 236d56e + 92fed02 commit c956a88
Show file tree
Hide file tree
Showing 5 changed files with 109 additions and 26 deletions.
6 changes: 6 additions & 0 deletions api-ref/source/os-services.inc
Original file line number Diff line number Diff line change
Expand Up @@ -349,6 +349,12 @@ Attempts to delete a ``nova-compute`` service which is still hosting instances
will result in a 409 HTTPConflict response. The instances will need to be
migrated or deleted before a compute service can be deleted.

Similarly, attempts to delete a ``nova-compute`` service which is involved in
in-progress migrations will result in a 409 HTTPConflict response. The
migrations will need to be completed, for example confirming or reverting a
resize, or the instances will need to be deleted before the compute service can
be deleted.

.. important:: Be sure to stop the actual ``nova-compute`` process on the
physical host *before* deleting the service with this API.
Failing to do so can lead to the running service re-creating
Expand Down
42 changes: 40 additions & 2 deletions nova/api/openstack/compute/services.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,16 @@ def delete(self, req, id):
'is hosting instances. Migrate or '
'delete the instances first.'))

# Similarly, check to see if the are any in-progress migrations
# involving this host because if there are we need to block the
# service delete since we could orphan resource providers and
# break the ability to do things like confirm/revert instances
# in VERIFY_RESIZE status.
compute_nodes = objects.ComputeNodeList.get_all_by_host(
context, service.host)
self._assert_no_in_progress_migrations(
context, id, compute_nodes)

aggrs = self.aggregate_api.get_aggregates_by_host(context,
service.host)
for ag in aggrs:
Expand All @@ -274,8 +284,6 @@ def delete(self, req, id):
# placement for the compute nodes managed by this service;
# remember that an ironic compute service can manage multiple
# nodes
compute_nodes = objects.ComputeNodeList.get_all_by_host(
context, service.host)
for compute_node in compute_nodes:
try:
self.placementclient.delete_resource_provider(
Expand Down Expand Up @@ -303,6 +311,36 @@ def delete(self, req, id):
explanation = _("Service id %s refers to multiple services.") % id
raise webob.exc.HTTPBadRequest(explanation=explanation)

@staticmethod
def _assert_no_in_progress_migrations(context, service_id, compute_nodes):
"""Ensures there are no in-progress migrations on the given nodes.
:param context: nova auth RequestContext
:param service_id: id of the Service being deleted
:param compute_nodes: ComputeNodeList of nodes on a compute service
:raises: HTTPConflict if there are any in-progress migrations on the
nodes
"""
for cn in compute_nodes:
migrations = (
objects.MigrationList.get_in_progress_by_host_and_node(
context, cn.host, cn.hypervisor_hostname))
if migrations:
# Log the migrations for the operator and then raise
# a 409 error.
LOG.info('Unable to delete compute service with id %s '
'for host %s. There are %i in-progress '
'migrations involving the host. Migrations '
'(uuid:status): %s',
service_id, cn.host, len(migrations),
','.join(['%s:%s' % (mig.uuid, mig.status)
for mig in migrations]))
raise webob.exc.HTTPConflict(
explanation=_(
'Unable to delete compute service that has '
'in-progress migrations. Complete the '
'migrations or delete the instances first.'))

@validation.query_schema(services.index_query_schema_275, '2.75')
@validation.query_schema(services.index_query_schema, '2.0', '2.74')
@wsgi.expected_errors(())
Expand Down
12 changes: 12 additions & 0 deletions nova/tests/functional/integrated_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -962,6 +962,18 @@ def _confirm_resize(self, server):
'compute_confirm_resize', 'success')
return server

def _revert_resize(self, server):
self.api.post_server_action(server['id'], {'revertResize': None})
server = self._wait_for_state_change(self.api, server, 'ACTIVE')
self._wait_for_migration_status(server, ['reverted'])
# Note that the migration status is changed to "reverted" in the
# dest host revert_resize method but the allocations are cleaned up
# in the source host finish_revert_resize method so we need to wait
# for the finish_revert_resize method to complete.
fake_notifier.wait_for_versioned_notifications(
'instance.resize_revert.end')
return server

def get_unused_flavor_name_id(self):
flavors = self.api.get_flavors()
flavor_names = list()
Expand Down
65 changes: 41 additions & 24 deletions nova/tests/functional/wsgi/test_services.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,21 +194,29 @@ def test_migrate_confirm_after_deleted_source_compute(self):
# Delete the source compute service.
service = self.admin_api.get_services(
binary='nova-compute', host='host1')[0]
self.admin_api.api_delete('/os-services/%s' % service['id'])
# FIXME(mriedem): This is bug 1852610 where the compute service is
# deleted but the resource provider is not because there are still
# migration-based allocations against the source node provider.
# We expect the delete request to fail with a 409 error because of the
# instance in VERIFY_RESIZE status even though that instance is marked
# as being on host2 now.
ex = self.assertRaises(api_client.OpenStackApiException,
self.admin_api.api_delete,
'/os-services/%s' % service['id'])
self.assertEqual(409, ex.response.status_code)
self.assertIn('Unable to delete compute service that has in-progress '
'migrations', six.text_type(ex))
self.assertIn('There are 1 in-progress migrations involving the host',
self.stdlog.logger.output)
# The provider is still around because we did not delete the service.
resp = self.placement_api.get('/resource_providers/%s' % host1_rp_uuid)
self.assertEqual(200, resp.status)
self.assertFlavorMatchesUsage(host1_rp_uuid, flavor)
# Now try to confirm the migration.
# FIXME(mriedem): This will fail until bug 1852610 is fixed and the
# source compute service delete is blocked while there is an
# in-progress migration involving the node.
self.assertNotIn('ComputeHostNotFound', self.stdlog.logger.output)
self.api.post_server_action(server['id'], {'confirmResize': None})
self._wait_for_state_change(self.api, server, 'ERROR')
self.assertIn('ComputeHostNotFound', self.stdlog.logger.output)
self._confirm_resize(server)
# Delete the host1 service since the migration is confirmed and the
# server is on host2.
self.admin_api.api_delete('/os-services/%s' % service['id'])
# The host1 resource provider should be gone.
resp = self.placement_api.get('/resource_providers/%s' % host1_rp_uuid)
self.assertEqual(404, resp.status)

def test_resize_revert_after_deleted_source_compute(self):
"""Tests a scenario where a server is resized and while in
Expand All @@ -231,25 +239,34 @@ def test_resize_revert_after_deleted_source_compute(self):
# Delete the source compute service.
service = self.admin_api.get_services(
binary='nova-compute', host='host1')[0]
self.admin_api.api_delete('/os-services/%s' % service['id'])
# FIXME(mriedem): This is bug 1852610 where the compute service is
# deleted but the resource provider is not because there are still
# migration-based allocations against the source node provider.
# We expect the delete request to fail with a 409 error because of the
# instance in VERIFY_RESIZE status even though that instance is marked
# as being on host2 now.
ex = self.assertRaises(api_client.OpenStackApiException,
self.admin_api.api_delete,
'/os-services/%s' % service['id'])
self.assertEqual(409, ex.response.status_code)
self.assertIn('Unable to delete compute service that has in-progress '
'migrations', six.text_type(ex))
self.assertIn('There are 1 in-progress migrations involving the host',
self.stdlog.logger.output)
# The provider is still around because we did not delete the service.
resp = self.placement_api.get('/resource_providers/%s' % host1_rp_uuid)
self.assertEqual(200, resp.status)
self.assertFlavorMatchesUsage(host1_rp_uuid, flavor1)
# Now try to revert the resize.
# NOTE(mriedem): This actually works because the drop_move_claim
# happens in revert_resize on the dest host which still has its
# ComputeNode record. The migration-based allocations are reverted
# so the instance holds the allocations for the source provider and
# the allocations against the dest provider are dropped.
self.api.post_server_action(server['id'], {'revertResize': None})
self._wait_for_state_change(self.api, server, 'ACTIVE')
self.assertNotIn('ComputeHostNotFound', self.stdlog.logger.output)
# Now revert the resize.
self._revert_resize(server)
self.assertFlavorMatchesUsage(host1_rp_uuid, flavor1)
zero_flavor = {'vcpus': 0, 'ram': 0, 'disk': 0, 'extra_specs': {}}
self.assertFlavorMatchesUsage(host2_rp_uuid, zero_flavor)
# Delete the host2 service since the migration is reverted and the
# server is on host1 again.
service2 = self.admin_api.get_services(
binary='nova-compute', host='host2')[0]
self.admin_api.api_delete('/os-services/%s' % service2['id'])
# The host2 resource provider should be gone.
resp = self.placement_api.get('/resource_providers/%s' % host2_rp_uuid)
self.assertEqual(404, resp.status)


class ComputeStatusFilterTest(integrated_helpers.ProviderUsageBaseTestCase):
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
---
fixes:
- |
The ``DELETE /os-services/{service_id}`` compute API will now return a
``409 HTTPConflict`` response when trying to delete a ``nova-compute``
service which is involved in in-progress migrations. This is because doing
so would not only orphan the compute node resource provider in the
placement service on which those instances have resource allocations but
can also break the ability to confirm/revert a pending resize properly.
See https://bugs.launchpad.net/nova/+bug/1852610 for more details.

0 comments on commit c956a88

Please sign in to comment.