-
-
Notifications
You must be signed in to change notification settings - Fork 116
Open
Description
Describe the issue:
Due to a network/DNS issue, the following error might appear during cluster creation:
PapermillExecutionError:
---------------------------------------------------------------------------
Exception encountered at "In [4]":
---------------------------------------------------------------------------
gaierror Traceback (most recent call last)
File /opt/conda/lib/python3.11/site-packages/httplib2/__init__.py:1366, in
Http._conn_request(self, conn, request_uri, method, body, headers)
1365 if conn.sock is None:
-> 1366 conn.connect()
1367 conn.request(method, request_uri, body, headers)
File /opt/conda/lib/python3.11/site-packages/httplib2/__init__.py:1142, in
HTTPSConnectionWithTimeout.connect(self)
1140 socket_err = None
-> 1142 address_info = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
1143 for family, socktype, proto, canonname, sockaddr in address_info:
File /opt/conda/lib/python3.11/socket.py:974, in getaddrinfo(host, port, family,
type, proto, flags)
973 addrlist = []
--> 974 for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
975 af, socktype, proto, canonname, sa = res
gaierror: [Errno -3] Temporary failure in name resolution
During handling of the above exception, another exception occurred:
ServerNotFoundError Traceback (most recent call last)
File /opt/conda/lib/python3.11/site-packages/distributed/deploy/spec.py:286, in
SpecCluster.__init__(self, workers, scheduler, worker, asynchronous, loop,
security, silence_logs, name, shutdown_on_close, scheduler_sync_interval)
285 try:
--> 286 self.sync(self._correct_state)
287 except Exception:
File /opt/conda/lib/python3.11/site-packages/distributed/utils.py:363, in
SyncMethodMixin.sync(self, func, asynchronous, callback_timeout, *args,
**kwargs)
362 else:
--> 363 return sync(
364 self.loop, func, *args, callback_timeout=callback_timeout,
**kwargs
365 )
File /opt/conda/lib/python3.11/site-packages/distributed/utils.py:439, in
sync(loop, func, callback_timeout, *args, **kwargs)
438 if error is not None:
--> 439 raise error
440 else:
File /opt/conda/lib/python3.11/site-packages/distributed/utils.py:413, in
sync.<locals>.f()
412 future = asyncio.ensure_future(awaitable)
--> 413 result = yield future
414 except Exception as exception:
File /opt/conda/lib/python3.11/site-packages/tornado/gen.py:766, in
Runner.run(self)
765 try:
--> 766 value = future.result()
767 except Exception as e:
768 # Save the exception for later. It's important that
769 # gen.throw() not be called inside this try/except block
770 # because that makes sys.exc_info behave unexpectedly.
File /opt/conda/lib/python3.11/site-packages/distributed/deploy/spec.py:390, in
SpecCluster._correct_state_internal(self)
387 # Collect exceptions from failed workers. This must happen after all
388 # *other* workers have finished initialising, so that we can have a
389 # proper teardown.
--> 390 await asyncio.gather(*worker_futs)
File /opt/conda/lib/python3.11/asyncio/tasks.py:694, in
_wrap_awaitable(awaitable)
689 """Helper for asyncio.ensure_future().
690
691 Wraps awaitable (an object with __await__) into a coroutine
692 that will later be wrapped in a Task by ensure_future().
693 """
--> 694 return (yield from awaitable.__await__())
File /opt/conda/lib/python3.11/site-packages/distributed/deploy/spec.py:74, in
ProcessInterface.__await__.<locals>._()
73 if self.status == Status.created:
---> 74 await self.start()
75 assert self.status == Status.running
File
/opt/conda/lib/python3.11/site-packages/dask_cloudprovider/gcp/instances.py:374,
in GCPWorker.start(self)
373 await super().start()
--> 374 await self.start_worker()
File
/opt/conda/lib/python3.11/site-packages/dask_cloudprovider/gcp/instances.py:378,
in GCPWorker.start_worker(self)
377 self.cluster._log("Creating worker instance")
--> 378 self.internal_ip, self.external_ip = await self.create_vm()
379 if self.config.get("public_ingress", True):
380 # scheduler is publicly available
File
/opt/conda/lib/python3.11/site-packages/dask_cloudprovider/gcp/instances.py:220,
in GCPInstance.create_vm(self)
219 raise Exception(str(e))
--> 220 while await self.update_status() != "RUNNING":
221 await asyncio.sleep(0.5)
File
/opt/conda/lib/python3.11/site-packages/dask_cloudprovider/gcp/instances.py:256,
in GCPInstance.update_status(self)
255 async def update_status(self):
--> 256 d = await self.cluster.call_async(
257 self.cluster.compute.instances()
258 .list(project=self.projectid, zone=self.zone,
filter=f"name={self.name}")
259 .execute
260 )
261 self.gcp_inst = d
File
/opt/conda/lib/python3.11/site-packages/dask_cloudprovider/generic/vmcluster.py:
321, in VMCluster.call_async(self, f, *args, **kwargs)
317 [done], _ = await asyncio.wait(
318 fs={self.loop.run_in_executor(None, lambda: f(*args, **kwargs))},
319 return_when=asyncio.ALL_COMPLETED,
320 )
--> 321 return done.result()
File /opt/conda/lib/python3.11/concurrent/futures/thread.py:58, in
_WorkItem.run(self)
57 try:
---> 58 result = self.fn(*self.args, **self.kwargs)
59 except BaseException as exc:
File
/opt/conda/lib/python3.11/site-packages/dask_cloudprovider/generic/vmcluster.py:
318, in VMCluster.call_async.<locals>.<lambda>()
309 """Run a blocking function in a thread as a coroutine.
310
311 This can only be used to make IO-bound operations non-blocking due to
the GIL.
(...) 315
316 """
317 [done], _ = await asyncio.wait(
--> 318 fs={self.loop.run_in_executor(None, lambda: f(*args, **kwargs))},
319 return_when=asyncio.ALL_COMPLETED,
320 )
321 return done.result()
File /opt/conda/lib/python3.11/site-packages/googleapiclient/_helpers.py:130, in
positional.<locals>.positional_decorator.<locals>.positional_wrapper(*args,
**kwargs)
129 logger.warning(message)
--> 130 return wrapped(*args, **kwargs)
File /opt/conda/lib/python3.11/site-packages/googleapiclient/http.py:923, in
HttpRequest.execute(self, http, num_retries)
922 # Handle retries for server-side errors.
--> 923 resp, content = _retry_request(
924 http,
925 num_retries,
926 "request",
927 self._sleep,
928 self._rand,
929 str(self.uri),
930 method=str(self.method),
931 body=self.body,
932 headers=self.headers,
933 )
935 for callback in self.response_callbacks:
File /opt/conda/lib/python3.11/site-packages/googleapiclient/http.py:222, in
_retry_request(http, num_retries, req_type, sleep, rand, uri, method, *args,
**kwargs)
221 if retry_num == num_retries:
--> 222 raise exception
223 else:
File /opt/conda/lib/python3.11/site-packages/googleapiclient/http.py:191, in
_retry_request(http, num_retries, req_type, sleep, rand, uri, method, *args,
**kwargs)
190 exception = None
--> 191 resp, content = http.request(uri, method, *args, **kwargs)
192 # Retry on SSL errors and socket timeout errors.
File /opt/conda/lib/python3.11/site-packages/google_auth_httplib2.py:218, in
AuthorizedHttp.request(self, uri, method, body, headers, redirections,
connection_type, **kwargs)
217 # Make the request.
--> 218 response, content = self.http.request(
219 uri,
220 method,
221 body=body,
222 headers=request_headers,
223 redirections=redirections,
224 connection_type=connection_type,
225 **kwargs
226 )
228 # If the response indicated that the credentials needed to be
229 # refreshed, then refresh the credentials and re-attempt the
230 # request.
231 # A stored token may expire between the time it is retrieved and
232 # the time the request is made, so we may need to try twice.
File /opt/conda/lib/python3.11/site-packages/httplib2/__init__.py:1724, in
Http.request(self, uri, method, body, headers, redirections, connection_type)
1723 else:
-> 1724 (response, content) = self._request(
1725 conn, authority, uri, request_uri, method, body,
headers, redirections, cachekey,
1726 )
1727 except Exception as e:
File /opt/conda/lib/python3.11/site-packages/httplib2/__init__.py:1444, in
Http._request(self, conn, host, absolute_uri, request_uri, method, body,
headers, redirections, cachekey)
1442 auth.request(method, request_uri, headers, body)
-> 1444 (response, content) = self._conn_request(conn, request_uri, method,
body, headers)
1446 if auth:
File /opt/conda/lib/python3.11/site-packages/httplib2/__init__.py:1373, in
Http._conn_request(self, conn, request_uri, method, body, headers)
1372 conn.close()
-> 1373 raise ServerNotFoundError("Unable to find the server at %s" %
conn.host)
1374 except socket.error as e:
ServerNotFoundError: Unable to find the server at compute.googleapis.com
During handling of the above exception, another exception occurred:
AssertionError Traceback (most recent call last)
Cell In[4], line 2
1 logger.info("Creating a remote cluster.")
----> 2 client, cluster = create_cluster(
3 machine_type=MACHINE_TYPE,
4 n_workers=N_WORKERS,
5 spot=SPOT,
6 )
8 # Upload packages
9 logger.debug("Project root (for package uploading): %s.", project_root)
File /app/demand_forecasting/dask/gcp.py:180, in create_cluster(dask_image_name,
dask_image_tag, pip_packages, n_workers, machine_type, spot, auto_shutdown,
projectid, zone, idle_timeout, private_image, forward_workers_logs, log_level,
auto_scaling, custom_image_id, main_package, force_build_image_only)
173 if not bootstrap:
174 #
175 logger.info(
176 "Cluster startup boostrap is disabled. Using the custom image
`%s`",
177 custom_image_id,
178 )
--> 180 cluster = GCPCluster(
181 **{
182 **cluster_params,
183 # Override default parameters
184 **{"source_image": custom_image_id, "bootstrap": bootstrap},
185 }
186 )
188 if auto_scaling:
189 cluster.adapt(minimum=min_workers, maximum=max_workers)
File
/opt/conda/lib/python3.11/site-packages/dask_cloudprovider/gcp/instances.py:642,
in GCPCluster.__init__(self, projectid, zone, network, network_projectid,
machine_type, on_host_maintenance, source_image, docker_image, ngpus, gpu_type,
filesystem_size, disk_type, auto_shutdown, bootstrap, preemptible, debug,
instance_labels, service_account, service_account_credentials, **kwargs)
639 if "extra_bootstrap" not in kwargs:
640 kwargs["extra_bootstrap"] = self.config.get("extra_bootstrap")
--> 642 super().__init__(debug=debug, **kwargs)
File
/opt/conda/lib/python3.11/site-packages/dask_cloudprovider/generic/vmcluster.py:
306, in VMCluster.__init__(self, n_workers, worker_class, worker_options,
scheduler_options, docker_image, docker_args, extra_bootstrap, env_vars,
security, protocol, debug, **kwargs)
303 self.worker_options["extra_bootstrap"] = extra_bootstrap
304 self.uuid = str(uuid.uuid4())[:8]
--> 306 super().__init__(**kwargs, security=self.security)
File /opt/conda/lib/python3.11/site-packages/distributed/deploy/spec.py:288, in
SpecCluster.__init__(self, workers, scheduler, worker, asynchronous, loop,
security, silence_logs, name, shutdown_on_close, scheduler_sync_interval)
286 self.sync(self._correct_state)
287 except Exception:
--> 288 self.sync(self.close)
289 self._loop_runner.stop()
290 raise
File /opt/conda/lib/python3.11/site-packages/distributed/utils.py:363, in
SyncMethodMixin.sync(self, func, asynchronous, callback_timeout, *args,
**kwargs)
361 return future
362 else:
--> 363 return sync(
364 self.loop, func, *args, callback_timeout=callback_timeout,
**kwargs
365 )
File /opt/conda/lib/python3.11/site-packages/distributed/utils.py:439, in
sync(loop, func, callback_timeout, *args, **kwargs)
436 wait(10)
438 if error is not None:
--> 439 raise error
440 else:
441 return result
File /opt/conda/lib/python3.11/site-packages/distributed/utils.py:413, in
sync.<locals>.f()
411 awaitable = wait_for(awaitable, timeout)
412 future = asyncio.ensure_future(awaitable)
--> 413 result = yield future
414 except Exception as exception:
415 error = exception
File /opt/conda/lib/python3.11/site-packages/tornado/gen.py:766, in
Runner.run(self)
764 try:
765 try:
--> 766 value = future.result()
767 except Exception as e:
768 # Save the exception for later. It's important that
769 # gen.throw() not be called inside this try/except block
770 # because that makes sys.exc_info behave unexpectedly.
771 exc: Optional[Exception] = e
File /opt/conda/lib/python3.11/site-packages/distributed/deploy/spec.py:462, in
SpecCluster._close(self)
460 await self.scheduler.close()
461 for w in self._created:
--> 462 assert w.status in {
463 Status.closing,
464 Status.closed,
465 Status.failed,
466 }, w.status
468 self.__exit_stack.__exit__(None, None, None)
469 await super()._close()
AssertionError: Status.running
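There should be proper network error handling: a transient DNS failure should not end up masked by the `AssertionError: Status.running` raised while the cluster is being closed.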
A short summary of the problem and a possible solution, generated by an LLM: https://chatgpt.com/share/683d56bd-0ee4-8013-8f81-259a0807b901
Minimal Complete Verifiable Example:
The problem can be reproduced by hardcoding the exception in the library so that it is raised explicitly.
TBD
Environment:
- Dask version: 2024.12.1
- Python version: 3.11
- Operating System: WSL
- Install method (conda, pip, source): poetry
Metadata
Metadata
Assignees
Labels
No labels