Skip to content

Commit aa77d86

Browse files
committed
feat: Add automatic health checks with retry logic to DeploymentRuntime
1 parent 6d8201d commit aa77d86

File tree

1 file changed

+41
-9
lines changed

1 file changed

+41
-9
lines changed

src/tetra_rp/deployment_runtime/client.py

Lines changed: 41 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -88,6 +88,8 @@ def __init__(
8888
raise DeploymentRuntimeConfigurationError("endpoint_url is required")
8989

9090
self._session = None
91+
self._health_checked = False
92+
self._health_check_retries = 3
9193

9294
if not self.api_key:
9395
log.warning(
@@ -220,6 +222,9 @@ def decorator(cls):
220222
async def call_remote_method(self, request: FunctionRequest) -> Any:
221223
"""Call remote method via /execute endpoint."""
222224
try:
225+
# Ensure endpoint is healthy before making the request
226+
await self._ensure_healthy()
227+
223228
url = f"{self.endpoint_url}/execute"
224229
payload = {"input": request.model_dump(exclude_none=True)}
225230

@@ -261,6 +266,9 @@ async def call_remote_method(self, request: FunctionRequest) -> Any:
261266
async def call_http_endpoint(self, method_name: str, data: Dict[str, Any]) -> Any:
262267
"""Call HTTP endpoint directly."""
263268
try:
269+
# Ensure endpoint is healthy before making the request
270+
await self._ensure_healthy()
271+
264272
url = f"{self.endpoint_url}/{method_name}"
265273

266274
log.debug(f"HTTP call to {url} for method: {method_name}")
@@ -279,17 +287,41 @@ async def call_http_endpoint(self, method_name: str, data: Dict[str, Any]) -> An
279287
{"method_name": method_name, "error": str(e)},
280288
)
281289

290+
async def _ensure_healthy(self) -> None:
    """Run the endpoint health check once before the first request.

    Returns immediately when a previous call already succeeded
    (``self._health_checked`` is true). Otherwise calls
    ``_perform_health_check`` up to ``self._health_check_retries`` times
    (always at least once), sleeping ``1.0 * attempt_number`` seconds
    after each failed attempt except the last.

    Raises:
        DeploymentRuntimeConnectionError: If every attempt fails. The last
            underlying exception is attached as ``__cause__``.
    """
    if self._health_checked:
        return

    log.debug("Performing automatic health check...")

    # A configured retry count of 0 (or less) would skip the loop entirely
    # and let requests through unchecked, so always probe at least once.
    max_attempts = max(1, self._health_check_retries)

    for attempt in range(1, max_attempts + 1):
        try:
            await self._perform_health_check()
        except Exception as e:
            if attempt == max_attempts:
                log.error(f"Health check failed after {max_attempts} attempts: {e}")
                # Chain explicitly so the traceback shows the probe failure
                # as the direct cause rather than an error during handling.
                raise DeploymentRuntimeConnectionError(
                    self.endpoint_url,
                    f"Endpoint health check failed after {max_attempts} attempts: {e}",
                    {'attempts': max_attempts, 'last_error': str(e)}
                ) from e
            log.warning(f"Health check attempt {attempt} failed, retrying...")
            await asyncio.sleep(1.0 * attempt)  # Progressive backoff
        else:
            # Remember the success so later requests skip the probe.
            self._health_checked = True
            log.debug(f"Health check successful on attempt {attempt}")
            return
314+
315+
async def _perform_health_check(self) -> Dict[str, Any]:
    """Send a GET to the endpoint's ``/health`` route and return the result.

    The request goes through ``_make_request_with_retry``; any exception it
    raises propagates to the caller unchanged.
    """
    url = f"{self.endpoint_url}/health"
    log.debug(f"Health check: {url}")
    response = await self._make_request_with_retry("GET", url, "health_check")
    return response
320+
282321
async def health_check(self) -> Dict[str, Any]:
283-
"""Check DeploymentRuntime health."""
322+
"""Check DeploymentRuntime health (public method)."""
284323
try:
285-
url = f"{self.endpoint_url}/health"
286-
287-
log.debug(f"Health check: {url}")
288-
289-
result = await self._make_request_with_retry("GET", url, "health_check")
290-
291-
return result
292-
324+
return await self._perform_health_check()
293325
except DeploymentRuntimeConnectionError:
294326
raise
295327
except Exception as e:

0 commit comments

Comments
 (0)