Skip to content

More robust network issues handling #453

@dbalabka

Description

@dbalabka

Describe the issue:
Due to a network/DNS issue, the following error might appear during cluster creation:

PapermillExecutionError:
---------------------------------------------------------------------------
Exception encountered at "In [4]":
---------------------------------------------------------------------------
gaierror                                  Traceback (most recent call last)
File /opt/conda/lib/python3.11/site-packages/httplib2/__init__.py:1366, in
Http._conn_request(self, conn, request_uri, method, body, headers)
   1365 if conn.sock is None:
-> 1366     conn.connect()
   1367 conn.request(method, request_uri, body, headers)

File /opt/conda/lib/python3.11/site-packages/httplib2/__init__.py:1142, in
HTTPSConnectionWithTimeout.connect(self)
   1140 socket_err = None
-> 1142 address_info = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
   1143 for family, socktype, proto, canonname, sockaddr in address_info:

File /opt/conda/lib/python3.11/socket.py:974, in getaddrinfo(host, port, family,
type, proto, flags)
    973 addrlist = []
--> 974 for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
    975     af, socktype, proto, canonname, sa = res

gaierror: [Errno -3] Temporary failure in name resolution

During handling of the above exception, another exception occurred:

ServerNotFoundError                       Traceback (most recent call last)
File /opt/conda/lib/python3.11/site-packages/distributed/deploy/spec.py:286, in
SpecCluster.__init__(self, workers, scheduler, worker, asynchronous, loop,
security, silence_logs, name, shutdown_on_close, scheduler_sync_interval)
    285 try:
--> 286     self.sync(self._correct_state)
    287 except Exception:

File /opt/conda/lib/python3.11/site-packages/distributed/utils.py:363, in
SyncMethodMixin.sync(self, func, asynchronous, callback_timeout, *args,
**kwargs)
    362 else:
--> 363     return sync(
    364         self.loop, func, *args, callback_timeout=callback_timeout,
**kwargs
    365     )

File /opt/conda/lib/python3.11/site-packages/distributed/utils.py:439, in
sync(loop, func, callback_timeout, *args, **kwargs)
    438 if error is not None:
--> 439     raise error
    440 else:

File /opt/conda/lib/python3.11/site-packages/distributed/utils.py:413, in
sync.<locals>.f()
    412     future = asyncio.ensure_future(awaitable)
--> 413     result = yield future
    414 except Exception as exception:

File /opt/conda/lib/python3.11/site-packages/tornado/gen.py:766, in
Runner.run(self)
    765 try:
--> 766     value = future.result()
    767 except Exception as e:
    768     # Save the exception for later. It's important that
    769     # gen.throw() not be called inside this try/except block
    770     # because that makes sys.exc_info behave unexpectedly.

File /opt/conda/lib/python3.11/site-packages/distributed/deploy/spec.py:390, in
SpecCluster._correct_state_internal(self)
    387 # Collect exceptions from failed workers. This must happen after all
    388 # *other* workers have finished initialising, so that we can have a
    389 # proper teardown.
--> 390 await asyncio.gather(*worker_futs)

File /opt/conda/lib/python3.11/asyncio/tasks.py:694, in
_wrap_awaitable(awaitable)
    689 """Helper for asyncio.ensure_future().
    690
    691 Wraps awaitable (an object with __await__) into a coroutine
    692 that will later be wrapped in a Task by ensure_future().
    693 """
--> 694 return (yield from awaitable.__await__())

File /opt/conda/lib/python3.11/site-packages/distributed/deploy/spec.py:74, in
ProcessInterface.__await__.<locals>._()
     73 if self.status == Status.created:
---> 74     await self.start()
     75     assert self.status == Status.running

File
/opt/conda/lib/python3.11/site-packages/dask_cloudprovider/gcp/instances.py:374,
in GCPWorker.start(self)
    373 await super().start()
--> 374 await self.start_worker()

File
/opt/conda/lib/python3.11/site-packages/dask_cloudprovider/gcp/instances.py:378,
in GCPWorker.start_worker(self)
    377 self.cluster._log("Creating worker instance")
--> 378 self.internal_ip, self.external_ip = await self.create_vm()
    379 if self.config.get("public_ingress", True):
    380     # scheduler is publicly available

File
/opt/conda/lib/python3.11/site-packages/dask_cloudprovider/gcp/instances.py:220,
in GCPInstance.create_vm(self)
    219     raise Exception(str(e))
--> 220 while await self.update_status() != "RUNNING":
    221     await asyncio.sleep(0.5)

File
/opt/conda/lib/python3.11/site-packages/dask_cloudprovider/gcp/instances.py:256,
in GCPInstance.update_status(self)
    255 async def update_status(self):
--> 256     d = await self.cluster.call_async(
    257         self.cluster.compute.instances()
    258         .list(project=self.projectid, zone=self.zone,
filter=f"name={self.name}")
    259         .execute
    260     )
    261     self.gcp_inst = d

File
/opt/conda/lib/python3.11/site-packages/dask_cloudprovider/generic/vmcluster.py:
321, in VMCluster.call_async(self, f, *args, **kwargs)
    317 [done], _ = await asyncio.wait(
    318     fs={self.loop.run_in_executor(None, lambda: f(*args, **kwargs))},
    319     return_when=asyncio.ALL_COMPLETED,
    320 )
--> 321 return done.result()

File /opt/conda/lib/python3.11/concurrent/futures/thread.py:58, in
_WorkItem.run(self)
     57 try:
---> 58     result = self.fn(*self.args, **self.kwargs)
     59 except BaseException as exc:

File
/opt/conda/lib/python3.11/site-packages/dask_cloudprovider/generic/vmcluster.py:
318, in VMCluster.call_async.<locals>.<lambda>()
    309 """Run a blocking function in a thread as a coroutine.
    310
    311 This can only be used to make IO-bound operations non-blocking due to
the GIL.
   (...)    315
    316 """
    317 [done], _ = await asyncio.wait(
--> 318     fs={self.loop.run_in_executor(None, lambda: f(*args, **kwargs))},
    319     return_when=asyncio.ALL_COMPLETED,
    320 )
    321 return done.result()

File /opt/conda/lib/python3.11/site-packages/googleapiclient/_helpers.py:130, in
positional.<locals>.positional_decorator.<locals>.positional_wrapper(*args,
**kwargs)
    129         logger.warning(message)
--> 130 return wrapped(*args, **kwargs)

File /opt/conda/lib/python3.11/site-packages/googleapiclient/http.py:923, in
HttpRequest.execute(self, http, num_retries)
    922 # Handle retries for server-side errors.
--> 923 resp, content = _retry_request(
    924     http,
    925     num_retries,
    926     "request",
    927     self._sleep,
    928     self._rand,
    929     str(self.uri),
    930     method=str(self.method),
    931     body=self.body,
    932     headers=self.headers,
    933 )
    935 for callback in self.response_callbacks:

File /opt/conda/lib/python3.11/site-packages/googleapiclient/http.py:222, in
_retry_request(http, num_retries, req_type, sleep, rand, uri, method, *args,
**kwargs)
    221 if retry_num == num_retries:
--> 222     raise exception
    223 else:

File /opt/conda/lib/python3.11/site-packages/googleapiclient/http.py:191, in
_retry_request(http, num_retries, req_type, sleep, rand, uri, method, *args,
**kwargs)
    190     exception = None
--> 191     resp, content = http.request(uri, method, *args, **kwargs)
    192 # Retry on SSL errors and socket timeout errors.

File /opt/conda/lib/python3.11/site-packages/google_auth_httplib2.py:218, in
AuthorizedHttp.request(self, uri, method, body, headers, redirections,
connection_type, **kwargs)
    217 # Make the request.
--> 218 response, content = self.http.request(
    219     uri,
    220     method,
    221     body=body,
    222     headers=request_headers,
    223     redirections=redirections,
    224     connection_type=connection_type,
    225     **kwargs
    226 )
    228 # If the response indicated that the credentials needed to be
    229 # refreshed, then refresh the credentials and re-attempt the
    230 # request.
    231 # A stored token may expire between the time it is retrieved and
    232 # the time the request is made, so we may need to try twice.

File /opt/conda/lib/python3.11/site-packages/httplib2/__init__.py:1724, in
Http.request(self, uri, method, body, headers, redirections, connection_type)
   1723         else:
-> 1724             (response, content) = self._request(
   1725                 conn, authority, uri, request_uri, method, body,
headers, redirections, cachekey,
   1726             )
   1727 except Exception as e:

File /opt/conda/lib/python3.11/site-packages/httplib2/__init__.py:1444, in
Http._request(self, conn, host, absolute_uri, request_uri, method, body,
headers, redirections, cachekey)
   1442     auth.request(method, request_uri, headers, body)
-> 1444 (response, content) = self._conn_request(conn, request_uri, method,
body, headers)
   1446 if auth:

File /opt/conda/lib/python3.11/site-packages/httplib2/__init__.py:1373, in
Http._conn_request(self, conn, request_uri, method, body, headers)
   1372     conn.close()
-> 1373     raise ServerNotFoundError("Unable to find the server at %s" %
conn.host)
   1374 except socket.error as e:

ServerNotFoundError: Unable to find the server at compute.googleapis.com

During handling of the above exception, another exception occurred:

AssertionError                            Traceback (most recent call last)
Cell In[4], line 2
      1 logger.info("Creating a remote cluster.")
----> 2 client, cluster = create_cluster(
      3     machine_type=MACHINE_TYPE,
      4     n_workers=N_WORKERS,
      5     spot=SPOT,
      6 )
      8 # Upload packages
      9 logger.debug("Project root (for package uploading): %s.", project_root)

File /app/demand_forecasting/dask/gcp.py:180, in create_cluster(dask_image_name,
dask_image_tag, pip_packages, n_workers, machine_type, spot, auto_shutdown,
projectid, zone, idle_timeout, private_image, forward_workers_logs, log_level,
auto_scaling, custom_image_id, main_package, force_build_image_only)
    173 if not bootstrap:
    174     #
    175     logger.info(
    176         "Cluster startup boostrap is disabled. Using the custom image
`%s`",
    177         custom_image_id,
    178     )
--> 180 cluster = GCPCluster(
    181     **{
    182         **cluster_params,
    183         # Override default parameters
    184         **{"source_image": custom_image_id, "bootstrap": bootstrap},
    185     }
    186 )
    188 if auto_scaling:
    189     cluster.adapt(minimum=min_workers, maximum=max_workers)

File
/opt/conda/lib/python3.11/site-packages/dask_cloudprovider/gcp/instances.py:642,
in GCPCluster.__init__(self, projectid, zone, network, network_projectid,
machine_type, on_host_maintenance, source_image, docker_image, ngpus, gpu_type,
filesystem_size, disk_type, auto_shutdown, bootstrap, preemptible, debug,
instance_labels, service_account, service_account_credentials, **kwargs)
    639 if "extra_bootstrap" not in kwargs:
    640     kwargs["extra_bootstrap"] = self.config.get("extra_bootstrap")
--> 642 super().__init__(debug=debug, **kwargs)

File
/opt/conda/lib/python3.11/site-packages/dask_cloudprovider/generic/vmcluster.py:
306, in VMCluster.__init__(self, n_workers, worker_class, worker_options,
scheduler_options, docker_image, docker_args, extra_bootstrap, env_vars,
security, protocol, debug, **kwargs)
    303 self.worker_options["extra_bootstrap"] = extra_bootstrap
    304 self.uuid = str(uuid.uuid4())[:8]
--> 306 super().__init__(**kwargs, security=self.security)

File /opt/conda/lib/python3.11/site-packages/distributed/deploy/spec.py:288, in
SpecCluster.__init__(self, workers, scheduler, worker, asynchronous, loop,
security, silence_logs, name, shutdown_on_close, scheduler_sync_interval)
    286     self.sync(self._correct_state)
    287 except Exception:
--> 288     self.sync(self.close)
    289     self._loop_runner.stop()
    290     raise

File /opt/conda/lib/python3.11/site-packages/distributed/utils.py:363, in
SyncMethodMixin.sync(self, func, asynchronous, callback_timeout, *args,
**kwargs)
    361     return future
    362 else:
--> 363     return sync(
    364         self.loop, func, *args, callback_timeout=callback_timeout,
**kwargs
    365     )

File /opt/conda/lib/python3.11/site-packages/distributed/utils.py:439, in
sync(loop, func, callback_timeout, *args, **kwargs)
    436         wait(10)
    438 if error is not None:
--> 439     raise error
    440 else:
    441     return result

File /opt/conda/lib/python3.11/site-packages/distributed/utils.py:413, in
sync.<locals>.f()
    411         awaitable = wait_for(awaitable, timeout)
    412     future = asyncio.ensure_future(awaitable)
--> 413     result = yield future
    414 except Exception as exception:
    415     error = exception

File /opt/conda/lib/python3.11/site-packages/tornado/gen.py:766, in
Runner.run(self)
    764 try:
    765     try:
--> 766         value = future.result()
    767     except Exception as e:
    768         # Save the exception for later. It's important that
    769         # gen.throw() not be called inside this try/except block
    770         # because that makes sys.exc_info behave unexpectedly.
    771         exc: Optional[Exception] = e

File /opt/conda/lib/python3.11/site-packages/distributed/deploy/spec.py:462, in
SpecCluster._close(self)
    460         await self.scheduler.close()
    461     for w in self._created:
--> 462         assert w.status in {
    463             Status.closing,
    464             Status.closed,
    465             Status.failed,
    466         }, w.status
    468 self.__exit_stack.__exit__(None, None, None)
    469 await super()._close()

AssertionError: Status.running

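There should be proper network error handling, so that a transient failure such as this DNS resolution error is not masked by the `AssertionError: Status.running` raised while the cluster is being closed.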

A short, LLM-generated summary of the problem and a possible solution: https://chatgpt.com/share/683d56bd-0ee4-8013-8f81-259a0807b901

Minimal Complete Verifiable Example:

The problem can be reproduced by hardcoding an explicit raise of the exception in the library.

TBD

Environment:

  • Dask version: 2024.12.1
  • Python version: 3.11
  • Operating System: WSL
  • Install method (conda, pip, source): poetry

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions