From 61a528311de710edceb23955b0f49358682760b0 Mon Sep 17 00:00:00 2001 From: Matt Riedemann Date: Wed, 22 May 2019 18:30:10 -0400 Subject: [PATCH] Remove PlacementAPIConnectFailure handling from AggregateAPI Change Ibd7aa4f8c4ea787774becece324d9051521c44b6 in Rocky started mirroring aggregate membership changes to the placement service but to ease upgrades (because before that nova-api didn't rely on placement) the API code would handle failures to connect to the placement service. A TODO was added to remove that handling in Stein, so it's time to remove it. Note that with the removal of that connection error handling, the API response in both add host to aggregate and remove host from aggregate APIs will be a 500. An error in add_host_to_aggregate should be remedied by running the "nova-manage placement sync_aggregates" command but a failure in remove_host_from_aggregate will not be [1]. As such the following in the remove_host_from_aggregate operation is changed: * ResourceProviderNotFound is still ignored since if we get that the provider likely no longer exists and is not in an aggregate anyway. * ResourceProviderUpdateConflict is handled in the route handler to return a 409 rather than 500 error. Anything else raised from SchedulerReportClient.aggregate_remove_host results in a 500 error. * The placement work is done first because if that fails the admin can try again from the compute API. Otherwise if we do the placement cleanup after Aggregate.delete_host is called in nova, the admin cannot retry the remove_host action because it will fail with AggregateHostNotFound. [1] https://opendev.org/openstack/nova/src/tag/20.0.0/nova/cmd/manage.py#L2533 Change-Id: I4dfbed3e41c4952efc2900a063cb22753539d6cb --- nova/api/openstack/compute/aggregates.py | 3 +- nova/compute/api.py | 51 ++++++------------- .../api/openstack/compute/test_aggregates.py | 12 +++++ nova/tests/unit/compute/test_host_api.py | 42 +++++++-------- 4 files changed, 49 insertions(+), 59 deletions(-) diff --git a/nova/api/openstack/compute/aggregates.py b/nova/api/openstack/compute/aggregates.py index ede4813d73c..6e709011128 100644 --- a/nova/api/openstack/compute/aggregates.py +++ b/nova/api/openstack/compute/aggregates.py @@ -178,7 +178,8 @@ def _remove_host(self, req, id, body): msg = _('Cannot remove host %(host)s in aggregate %(id)s') % { 'host': host, 'id': id} raise exc.HTTPNotFound(explanation=msg) - except exception.InvalidAggregateAction: + except (exception.InvalidAggregateAction, + exception.ResourceProviderUpdateConflict): msg = _('Cannot remove host %(host)s in aggregate %(id)s') % { 'host': host, 'id': id} raise exc.HTTPConflict(explanation=msg) diff --git a/nova/compute/api.py b/nova/compute/api.py index 8821680f105..06d0c352469 100644 --- a/nova/compute/api.py +++ b/nova/compute/api.py @@ -5719,16 +5719,6 @@ def add_host_to_aggregate(self, context, aggregate_id, host_name): try: self.placement_client.aggregate_add_host( context, aggregate.uuid, host_name=host_name) - except exception.PlacementAPIConnectFailure: - # NOTE(jaypipes): Rocky should be able to tolerate the nova-api - # service not communicating with the Placement API, so just log a - # warning here. - # TODO(jaypipes): Remove this in Stein, when placement must be able - # to be contacted from the nova-api service. - LOG.warning("Failed to associate %s with a placement " - "aggregate: %s. There was a failure to communicate " - "with the placement service.", - host_name, aggregate.uuid) except (exception.ResourceProviderNotFound, exception.ResourceProviderAggregateRetrievalFailed, exception.ResourceProviderUpdateFailed, @@ -5780,35 +5770,26 @@ def remove_host_from_aggregate(self, context, aggregate_id, host_name): action=fields_obj.NotificationAction.REMOVE_HOST, phase=fields_obj.NotificationPhase.START) - aggregate.delete_host(host_name) - self.query_client.update_aggregates(context, [aggregate]) + # Remove the resource provider from the provider aggregate first before + # we change anything on the nova side because if we did the nova stuff + # first we can't re-attempt this from the compute API if cleaning up + # placement fails. try: + # Anything else this raises is handled in the route handler as + # either a 409 (ResourceProviderUpdateConflict) or 500. self.placement_client.aggregate_remove_host( context, aggregate.uuid, host_name) - except exception.PlacementAPIConnectFailure: - # NOTE(jaypipes): Rocky should be able to tolerate the nova-api - # service not communicating with the Placement API, so just log a - # warning here. - # TODO(jaypipes): Remove this in Stein, when placement must be able - # to be contacted from the nova-api service. - LOG.warning("Failed to remove association of %s with a placement " - "aggregate: %s. There was a failure to communicate " - "with the placement service.", - host_name, aggregate.uuid) - except (exception.ResourceProviderNotFound, - exception.ResourceProviderAggregateRetrievalFailed, - exception.ResourceProviderUpdateFailed, - exception.ResourceProviderUpdateConflict) as err: - # NOTE(jaypipes): We don't want a failure perform the mirroring - # action in the placement service to be returned to the user (they - # probably don't know anything about the placement service and - # would just be confused). So, we just log a warning here, noting - # that on the next run of nova-manage placement sync_aggregates - # things will go back to normal + except exception.ResourceProviderNotFound as err: + # If the resource provider is not found then it's likely not part + # of the aggregate anymore anyway since provider aggregates are + # not resources themselves with metadata like nova aggregates, they + # are just a grouping concept around resource providers. Log and + # continue. LOG.warning("Failed to remove association of %s with a placement " - "aggregate: %s. This may be corrected after running " - "nova-manage placement sync_aggregates.", - host_name, err) + "aggregate: %s.", host_name, err) + + aggregate.delete_host(host_name) + self.query_client.update_aggregates(context, [aggregate]) self._update_az_cache_for_host(context, host_name, aggregate.metadata) self.compute_rpcapi.remove_aggregate_host(context, aggregate=aggregate, host_param=host_name, host=host_name) diff --git a/nova/tests/unit/api/openstack/compute/test_aggregates.py b/nova/tests/unit/api/openstack/compute/test_aggregates.py index eb185849a64..3bf1c838755 100644 --- a/nova/tests/unit/api/openstack/compute/test_aggregates.py +++ b/nova/tests/unit/api/openstack/compute/test_aggregates.py @@ -588,6 +588,18 @@ def test_remove_host_with_missing_host_empty(self): eval(self.remove_host), self.req, "1", body={"remove_host": {}}) + def test_remove_host_resource_provider_update_conflict(self): + """Tests that ResourceProviderUpdateConflict is handled as a 409.""" + side_effect = exception.ResourceProviderUpdateConflict( + uuid=uuidsentinel.provider_id, generation=1, error='try again!') + with mock.patch.object(self.controller.api, + 'remove_host_from_aggregate', + side_effect=side_effect) as mock_rem: + self.assertRaises(exc.HTTPConflict, eval(self.remove_host), + self.req, "1", + body={"remove_host": {"host": "bogushost"}}) + mock_rem.assert_called_once_with(self.context, "1", "bogushost") + def test_set_metadata(self): body = {"set_metadata": {"metadata": {"foo": "bar"}}} with mock.patch.object(self.controller.api, diff --git a/nova/tests/unit/compute/test_host_api.py b/nova/tests/unit/compute/test_host_api.py index e986790b130..09239879ae8 100644 --- a/nova/tests/unit/compute/test_host_api.py +++ b/nova/tests/unit/compute/test_host_api.py @@ -653,42 +653,36 @@ def test_aggregate_add_host_placement_missing_provider( @mock.patch('nova.scheduler.client.report.SchedulerReportClient.' 'aggregate_add_host') - @mock.patch.object(compute.LOG, 'warning') - def test_aggregate_add_host_bad_placement( - self, mock_log, mock_pc_add_host): + def test_aggregate_add_host_bad_placement(self, mock_pc_add_host): hostname = 'fake-host' mock_pc_add_host.side_effect = exception.PlacementAPIConnectFailure aggregate = self.aggregate_api.create_aggregate( self.ctxt, 'aggregate', None) agg_uuid = aggregate.uuid - self.aggregate_api.add_host_to_aggregate( - self.ctxt, aggregate.id, hostname) - # Nothing should blow up in Rocky, but we should get a warning about - # placement connectivity failure - msg = ("Failed to associate %s with a placement " - "aggregate: %s. There was a failure to communicate " - "with the placement service.") - mock_log.assert_called_with(msg, hostname, agg_uuid) + self.assertRaises(exception.PlacementAPIConnectFailure, + self.aggregate_api.add_host_to_aggregate, + self.ctxt, aggregate.id, hostname) + mock_pc_add_host.assert_called_once_with( + self.ctxt, agg_uuid, host_name=hostname) @mock.patch('nova.objects.Aggregate.delete_host') @mock.patch('nova.scheduler.client.report.SchedulerReportClient.' 'aggregate_remove_host') - @mock.patch.object(compute.LOG, 'warning') def test_aggregate_remove_host_bad_placement( - self, mock_log, mock_pc_remove_host, mock_agg_obj_delete_host): + self, mock_pc_remove_host, mock_agg_obj_delete_host): hostname = 'fake-host' mock_pc_remove_host.side_effect = exception.PlacementAPIConnectFailure aggregate = self.aggregate_api.create_aggregate( self.ctxt, 'aggregate', None) agg_uuid = aggregate.uuid - self.aggregate_api.remove_host_from_aggregate( - self.ctxt, aggregate.id, hostname) - # Nothing should blow up in Rocky, but we should get a warning about - # placement connectivity failure - msg = ("Failed to remove association of %s with a placement " - "aggregate: %s. There was a failure to communicate " - "with the placement service.") - mock_log.assert_called_with(msg, hostname, agg_uuid) + self.assertRaises(exception.PlacementAPIConnectFailure, + self.aggregate_api.remove_host_from_aggregate, + self.ctxt, aggregate.id, hostname) + mock_pc_remove_host.assert_called_once_with( + self.ctxt, agg_uuid, hostname) + # Make sure mock_agg_obj_delete_host wasn't called since placement + # should be tried first and failed with a server failure. + mock_agg_obj_delete_host.assert_not_called() @mock.patch('nova.objects.Aggregate.delete_host') @mock.patch('nova.scheduler.client.report.SchedulerReportClient.' @@ -705,6 +699,8 @@ def test_aggregate_remove_host_placement_missing_provider( self.ctxt, aggregate.id, hostname) # Nothing should blow up in Rocky, but we should get a warning msg = ("Failed to remove association of %s with a placement " - "aggregate: %s. This may be corrected after running " - "nova-manage placement sync_aggregates.") + "aggregate: %s.") mock_log.assert_called_with(msg, hostname, err) + # In this case Aggregate.delete_host is still called because the + # ResourceProviderNotFound error is just logged. + mock_agg_obj_delete_host.assert_called_once_with(hostname)