diff --git a/api-ref/source/os-services.inc b/api-ref/source/os-services.inc index 788179b0a7c..af495b4bac2 100644 --- a/api-ref/source/os-services.inc +++ b/api-ref/source/os-services.inc @@ -349,6 +349,12 @@ Attempts to delete a ``nova-compute`` service which is still hosting instances will result in a 409 HTTPConflict response. The instances will need to be migrated or deleted before a compute service can be deleted. +Similarly, attempts to delete a ``nova-compute`` service which is involved in +in-progress migrations will result in a 409 HTTPConflict response. The +migrations will need to be completed, for example confirming or reverting a +resize, or the instances will need to be deleted before the compute service can +be deleted. + .. important:: Be sure to stop the actual ``nova-compute`` process on the physical host *before* deleting the service with this API. Failing to do so can lead to the running service re-creating diff --git a/nova/api/openstack/compute/services.py b/nova/api/openstack/compute/services.py index a38f03ca5aa..f6ab02348f4 100644 --- a/nova/api/openstack/compute/services.py +++ b/nova/api/openstack/compute/services.py @@ -264,6 +264,16 @@ def delete(self, req, id): 'is hosting instances. Migrate or ' 'delete the instances first.')) + # Similarly, check to see if the are any in-progress migrations + # involving this host because if there are we need to block the + # service delete since we could orphan resource providers and + # break the ability to do things like confirm/revert instances + # in VERIFY_RESIZE status. + compute_nodes = objects.ComputeNodeList.get_all_by_host( + context, service.host) + self._assert_no_in_progress_migrations( + context, id, compute_nodes) + aggrs = self.aggregate_api.get_aggregates_by_host(context, service.host) for ag in aggrs: @@ -274,8 +284,6 @@ def delete(self, req, id): # placement for the compute nodes managed by this service; # remember that an ironic compute service can manage multiple # nodes - compute_nodes = objects.ComputeNodeList.get_all_by_host( - context, service.host) for compute_node in compute_nodes: try: self.placementclient.delete_resource_provider( @@ -303,6 +311,36 @@ def delete(self, req, id): explanation = _("Service id %s refers to multiple services.") % id raise webob.exc.HTTPBadRequest(explanation=explanation) + @staticmethod + def _assert_no_in_progress_migrations(context, service_id, compute_nodes): + """Ensures there are no in-progress migrations on the given nodes. + + :param context: nova auth RequestContext + :param service_id: id of the Service being deleted + :param compute_nodes: ComputeNodeList of nodes on a compute service + :raises: HTTPConflict if there are any in-progress migrations on the + nodes + """ + for cn in compute_nodes: + migrations = ( + objects.MigrationList.get_in_progress_by_host_and_node( + context, cn.host, cn.hypervisor_hostname)) + if migrations: + # Log the migrations for the operator and then raise + # a 409 error. + LOG.info('Unable to delete compute service with id %s ' + 'for host %s. There are %i in-progress ' + 'migrations involving the host. Migrations ' + '(uuid:status): %s', + service_id, cn.host, len(migrations), + ','.join(['%s:%s' % (mig.uuid, mig.status) + for mig in migrations])) + raise webob.exc.HTTPConflict( + explanation=_( + 'Unable to delete compute service that has ' + 'in-progress migrations. Complete the ' + 'migrations or delete the instances first.')) + @validation.query_schema(services.index_query_schema_275, '2.75') @validation.query_schema(services.index_query_schema, '2.0', '2.74') @wsgi.expected_errors(()) diff --git a/nova/tests/functional/integrated_helpers.py b/nova/tests/functional/integrated_helpers.py index e912ddb577d..3674e91cd8a 100644 --- a/nova/tests/functional/integrated_helpers.py +++ b/nova/tests/functional/integrated_helpers.py @@ -955,6 +955,18 @@ def _confirm_resize(self, server): 'compute_confirm_resize', 'success') return server + def _revert_resize(self, server): + self.api.post_server_action(server['id'], {'revertResize': None}) + server = self._wait_for_state_change(self.api, server, 'ACTIVE') + self._wait_for_migration_status(server, ['reverted']) + # Note that the migration status is changed to "reverted" in the + # dest host revert_resize method but the allocations are cleaned up + # in the source host finish_revert_resize method so we need to wait + # for the finish_revert_resize method to complete. + fake_notifier.wait_for_versioned_notifications( + 'instance.resize_revert.end') + return server + def get_unused_flavor_name_id(self): flavors = self.api.get_flavors() flavor_names = list() diff --git a/nova/tests/functional/wsgi/test_services.py b/nova/tests/functional/wsgi/test_services.py index 0e4af361d2f..0b640141286 100644 --- a/nova/tests/functional/wsgi/test_services.py +++ b/nova/tests/functional/wsgi/test_services.py @@ -194,21 +194,29 @@ def test_migrate_confirm_after_deleted_source_compute(self): # Delete the source compute service. service = self.admin_api.get_services( binary='nova-compute', host='host1')[0] - self.admin_api.api_delete('/os-services/%s' % service['id']) - # FIXME(mriedem): This is bug 1852610 where the compute service is - # deleted but the resource provider is not because there are still - # migration-based allocations against the source node provider. + # We expect the delete request to fail with a 409 error because of the + # instance in VERIFY_RESIZE status even though that instance is marked + # as being on host2 now. + ex = self.assertRaises(api_client.OpenStackApiException, + self.admin_api.api_delete, + '/os-services/%s' % service['id']) + self.assertEqual(409, ex.response.status_code) + self.assertIn('Unable to delete compute service that has in-progress ' + 'migrations', six.text_type(ex)) + self.assertIn('There are 1 in-progress migrations involving the host', + self.stdlog.logger.output) + # The provider is still around because we did not delete the service. resp = self.placement_api.get('/resource_providers/%s' % host1_rp_uuid) self.assertEqual(200, resp.status) self.assertFlavorMatchesUsage(host1_rp_uuid, flavor) # Now try to confirm the migration. - # FIXME(mriedem): This will fail until bug 1852610 is fixed and the - # source compute service delete is blocked while there is an - # in-progress migration involving the node. - self.assertNotIn('ComputeHostNotFound', self.stdlog.logger.output) - self.api.post_server_action(server['id'], {'confirmResize': None}) - self._wait_for_state_change(self.api, server, 'ERROR') - self.assertIn('ComputeHostNotFound', self.stdlog.logger.output) + self._confirm_resize(server) + # Delete the host1 service since the migration is confirmed and the + # server is on host2. + self.admin_api.api_delete('/os-services/%s' % service['id']) + # The host1 resource provider should be gone. + resp = self.placement_api.get('/resource_providers/%s' % host1_rp_uuid) + self.assertEqual(404, resp.status) def test_resize_revert_after_deleted_source_compute(self): """Tests a scenario where a server is resized and while in @@ -231,25 +239,34 @@ def test_resize_revert_after_deleted_source_compute(self): # Delete the source compute service. service = self.admin_api.get_services( binary='nova-compute', host='host1')[0] - self.admin_api.api_delete('/os-services/%s' % service['id']) - # FIXME(mriedem): This is bug 1852610 where the compute service is - # deleted but the resource provider is not because there are still - # migration-based allocations against the source node provider. + # We expect the delete request to fail with a 409 error because of the + # instance in VERIFY_RESIZE status even though that instance is marked + # as being on host2 now. + ex = self.assertRaises(api_client.OpenStackApiException, + self.admin_api.api_delete, + '/os-services/%s' % service['id']) + self.assertEqual(409, ex.response.status_code) + self.assertIn('Unable to delete compute service that has in-progress ' + 'migrations', six.text_type(ex)) + self.assertIn('There are 1 in-progress migrations involving the host', + self.stdlog.logger.output) + # The provider is still around because we did not delete the service. resp = self.placement_api.get('/resource_providers/%s' % host1_rp_uuid) self.assertEqual(200, resp.status) self.assertFlavorMatchesUsage(host1_rp_uuid, flavor1) - # Now try to revert the resize. - # NOTE(mriedem): This actually works because the drop_move_claim - # happens in revert_resize on the dest host which still has its - # ComputeNode record. The migration-based allocations are reverted - # so the instance holds the allocations for the source provider and - # the allocations against the dest provider are dropped. - self.api.post_server_action(server['id'], {'revertResize': None}) - self._wait_for_state_change(self.api, server, 'ACTIVE') - self.assertNotIn('ComputeHostNotFound', self.stdlog.logger.output) + # Now revert the resize. + self._revert_resize(server) self.assertFlavorMatchesUsage(host1_rp_uuid, flavor1) zero_flavor = {'vcpus': 0, 'ram': 0, 'disk': 0, 'extra_specs': {}} self.assertFlavorMatchesUsage(host2_rp_uuid, zero_flavor) + # Delete the host2 service since the migration is reverted and the + # server is on host1 again. + service2 = self.admin_api.get_services( + binary='nova-compute', host='host2')[0] + self.admin_api.api_delete('/os-services/%s' % service2['id']) + # The host2 resource provider should be gone. + resp = self.placement_api.get('/resource_providers/%s' % host2_rp_uuid) + self.assertEqual(404, resp.status) class ComputeStatusFilterTest(integrated_helpers.ProviderUsageBaseTestCase): diff --git a/releasenotes/notes/bug-1852610-service-delete-with-migrations-ca0565fc0b503519.yaml b/releasenotes/notes/bug-1852610-service-delete-with-migrations-ca0565fc0b503519.yaml new file mode 100644 index 00000000000..7bee8090c27 --- /dev/null +++ b/releasenotes/notes/bug-1852610-service-delete-with-migrations-ca0565fc0b503519.yaml @@ -0,0 +1,10 @@ +--- +fixes: + - | + The ``DELETE /os-services/{service_id}`` compute API will now return a + ``409 HTTPConflict`` response when trying to delete a ``nova-compute`` + service which is involved in in-progress migrations. This is because doing + so would not only orphan the compute node resource provider in the + placement service on which those instances have resource allocations but + can also break the ability to confirm/revert a pending resize properly. + See https://bugs.launchpad.net/nova/+bug/1852610 for more details.