fix: apply upstream fixes for testing phase issues

endolith · endolith · commit 77882bcfe6b7 · 2025-11-14T01:47:01.000-05:00
Implement changes from runpod-workers/worker-vllm PRs runpod-workers#234, runpod-workers#236, and runpod-workers#138: - Remove space from gpuIds in hub.json (PR runpod-workers#234) - Remove unsupported CUDA versions 12.1-12.4 from hub.json and tests.json (PR runpod-workers#236) - Add error handling for engine initialization and handler exceptions (PR runpod-workers#138 style) - Ensure all errors return proper ErrorResponse format These fixes address GPU allocation, CUDA compatibility, and error handling issues that cause the testing phase to hang indefinitely.
diff --git a/.runpod/hub.json b/.runpod/hub.json
@@ -7,18 +7,14 @@
   "config": {
     "runsOn": "GPU",
     "containerDiskInGb": 150,
-    "gpuIds": "ADA_80_PRO, AMPERE_80",
+    "gpuIds": "ADA_80_PRO,AMPERE_80",
     "gpuCount": 1,
     "allowedCudaVersions": [
       "12.9",
       "12.8",
       "12.7",
       "12.6",
-      "12.5",
-      "12.4",
-      "12.3",
-      "12.2",
-      "12.1"
+      "12.5"
     ],
     "presets": [
       {
diff --git a/src/handler.py b/src/handler.py
@@ -1,22 +1,33 @@
 import os
 import runpod
-from utils import JobInput
+import logging
+from utils import JobInput, create_error_response
 from engine import vLLMEngine, OpenAIvLLMEngine
 
-vllm_engine = vLLMEngine()
-OpenAIvLLMEngine = OpenAIvLLMEngine(vllm_engine)
+# Initialize engines at module level with error handling
+try:
+    vllm_engine = vLLMEngine()
+    OpenAIvLLMEngine = OpenAIvLLMEngine(vllm_engine)
+except Exception as e:
+    logging.error(f"Failed to initialize vLLM engines: {e}")
+    raise
 
 async def handler(job):
-    job_input = JobInput(job["input"])
-    engine = OpenAIvLLMEngine if job_input.openai_route else vllm_engine
-    results_generator = engine.generate(job_input)
-    async for batch in results_generator:
-        yield batch
+    try:
+        job_input = JobInput(job["input"])
+        engine = OpenAIvLLMEngine if job_input.openai_route else vllm_engine
+        results_generator = engine.generate(job_input)
+        async for batch in results_generator:
+            yield batch
+    except Exception as e:
+        # Return error in the same format as engine errors
+        logging.error(f"Error in handler: {e}")
+        yield {"error": create_error_response(str(e)).model_dump()}
 
 runpod.serverless.start(
     {
         "handler": handler,
         "concurrency_modifier": lambda x: vllm_engine.max_concurrency,
         "return_aggregate_stream": True,
     }
-)
+)
diff --git a/tests.json b/tests.json