
Commit ddba62c

Authored by niklub and matt-bernstein
feat: DIA-1360: Add token cost KPI to the Prompt aggregate-level subset metrics (#201)
Co-authored-by: nik <[email protected]>
Co-authored-by: Matt Bernstein <[email protected]>
Co-authored-by: niklub <[email protected]>
1 parent a0e7f6e · commit ddba62c

13 files changed: +623 −125 lines

adala/runtimes/_litellm.py (+133 −49)
```diff
@@ -1,11 +1,17 @@
 import asyncio
 import logging
-from typing import Any, Dict, List, Optional, Union, Type
+from typing import Any, Dict, List, Optional, Type
 
 import litellm
-from litellm.exceptions import AuthenticationError
+from litellm.exceptions import (
+    AuthenticationError,
+    ContentPolicyViolationError,
+    BadRequestError,
+    NotFoundError,
+)
+from litellm.types.utils import Usage
 import instructor
-from instructor.exceptions import InstructorRetryException
+from instructor.exceptions import InstructorRetryException, IncompleteOutputException
 import traceback
 from adala.utils.exceptions import ConstrainedGenerationError
 from adala.utils.internal_data import InternalDataFrame
@@ -14,14 +20,14 @@
     parse_template,
     partial_str_format,
 )
-from openai import NotFoundError
 from pydantic import ConfigDict, field_validator, BaseModel
 from rich import print
 from tenacity import (
     AsyncRetrying,
     Retrying,
     retry_if_not_exception_type,
     stop_after_attempt,
+    wait_random_exponential,
 )
 from pydantic_core._pydantic_core import ValidationError
 
@@ -33,6 +39,25 @@
 logger = logging.getLogger(__name__)
 
 
+# basically only retrying on timeout, incomplete output, or rate limit
+# https://docs.litellm.ai/docs/exception_mapping#custom-mapping-list
+# NOTE: token usage is only correctly calculated if we only use instructor retries, not litellm retries
+# https://github.com/jxnl/instructor/pull/763
+RETRY_POLICY = dict(
+    retry=retry_if_not_exception_type(
+        (
+            ValidationError,
+            ContentPolicyViolationError,
+            AuthenticationError,
+            BadRequestError,
+        )
+    ),
+    # should stop earlier on ValidationError and later on other errors, but couldn't figure out how to do that cleanly
+    stop=stop_after_attempt(3),
+    wait=wait_random_exponential(multiplier=1, max=60),
+)
+
+
 def get_messages(
     user_prompt: str,
     system_prompt: Optional[str] = None,
```
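As an aside, here is a minimal, self-contained sketch of how a policy dict like `RETRY_POLICY` plugs into tenacity's `Retrying`/`AsyncRetrying`: exceptions excluded by `retry_if_not_exception_type` fail immediately, everything else is retried with jittered exponential backoff. The `flaky_call` function and the `ValueError` stand-in for the non-retryable errors are hypothetical, not part of this PR.

```python
# Sketch only: `flaky_call` and ValueError are stand-ins; the policy mirrors
# the shape of RETRY_POLICY above, not the exact exception list.
from tenacity import (
    RetryError,
    Retrying,
    retry_if_not_exception_type,
    stop_after_attempt,
    wait_random_exponential,
)

POLICY = dict(
    # exceptions listed here are NOT retried (like ValidationError / auth errors above)
    retry=retry_if_not_exception_type((ValueError,)),
    stop=stop_after_attempt(3),
    wait=wait_random_exponential(multiplier=1, max=60),
)

calls = {"n": 0}


def flaky_call() -> str:
    # Pretend this is an LLM request that times out twice, then succeeds.
    calls["n"] += 1
    if calls["n"] < 3:
        raise TimeoutError("transient failure")
    return "ok"


try:
    for attempt in Retrying(**POLICY):
        with attempt:
            result = flaky_call()
    print(result)  # "ok" on the third attempt
except RetryError:
    print("gave up after 3 attempts")
```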
```diff
@@ -59,6 +84,37 @@ def _format_error_dict(e: Exception) -> dict:
     return error_dct
 
 
+def _log_llm_exception(e) -> dict:
+    dct = _format_error_dict(e)
+    base_error = f"Inference error {dct['_adala_message']}"
+    tb = traceback.format_exc()
+    logger.error(f"{base_error}\nTraceback:\n{tb}")
+    return dct
+
+
+def _get_usage_dict(usage: Usage, model: str) -> Dict:
+    data = dict()
+    data["_prompt_tokens"] = usage.prompt_tokens
+    # will not exist if there is no completion
+    data["_completion_tokens"] = usage.get("completion_tokens", 0)
+    # can't use litellm.completion_cost bc it only takes the most recent completion, and .usage is summed over retries
+    # TODO make sure this is calculated correctly after we turn on caching
+    # litellm will register the cost of an azure model on first successful completion. If there hasn't been a successful completion, the model will not be registered
+    try:
+        prompt_cost, completion_cost = litellm.cost_per_token(
+            model, usage.prompt_tokens, usage.get("completion_tokens", 0)
+        )
+        data["_prompt_cost_usd"] = prompt_cost
+        data["_completion_cost_usd"] = completion_cost
+        data["_total_cost_usd"] = prompt_cost + completion_cost
+    except NotFoundError:
+        logger.error(f"Failed to get cost for model {model}")
+        data["_prompt_cost_usd"] = None
+        data["_completion_cost_usd"] = None
+        data["_total_cost_usd"] = None
+    return data
+
+
 class LiteLLMChatRuntime(Runtime):
     """
     Runtime that uses [LiteLLM API](https://litellm.vercel.app/docs) and chat
```
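For a sense of what `_get_usage_dict` emits, a small sketch using `litellm.cost_per_token` directly; the model name and token counts below are arbitrary, and actual prices come from litellm's pricing table.

```python
# Illustrative only: made-up token counts; the keys mirror the ones
# _get_usage_dict adds to each output record.
import litellm

model = "gpt-3.5-turbo"
prompt_tokens, completion_tokens = 1200, 250

prompt_cost, completion_cost = litellm.cost_per_token(
    model, prompt_tokens, completion_tokens
)

usage_fields = {
    "_prompt_tokens": prompt_tokens,
    "_completion_tokens": completion_tokens,
    "_prompt_cost_usd": prompt_cost,
    "_completion_cost_usd": completion_cost,
    "_total_cost_usd": prompt_cost + completion_cost,
}
print(usage_fields)
```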
```diff
@@ -173,45 +229,59 @@ def record_to_record(
             instructions_first,
         )
 
-        retries = Retrying(
-            retry=retry_if_not_exception_type((ValidationError)),
-            stop=stop_after_attempt(3),
-        )
+        retries = Retrying(**RETRY_POLICY)
 
         try:
             # returns a pydantic model named Output
-            response = instructor_client.chat.completions.create(
-                messages=messages,
-                response_model=response_model,
-                model=self.model,
-                max_tokens=self.max_tokens,
-                temperature=self.temperature,
-                seed=self.seed,
-                max_retries=retries,
-                # extra inference params passed to this runtime
-                **self.model_extra,
+            response, completion = (
+                instructor_client.chat.completions.create_with_completion(
+                    messages=messages,
+                    response_model=response_model,
+                    model=self.model,
+                    max_tokens=self.max_tokens,
+                    temperature=self.temperature,
+                    seed=self.seed,
+                    max_retries=retries,
+                    # extra inference params passed to this runtime
+                    **self.model_extra,
+                )
             )
+            usage = completion.usage
+            dct = response.dict()
+        except IncompleteOutputException as e:
+            usage = e.total_usage
+            dct = _log_llm_exception(e)
         except InstructorRetryException as e:
+            usage = e.total_usage
             # get root cause error from retries
             n_attempts = e.n_attempts
             e = e.__cause__.last_attempt.exception()
-            dct = _format_error_dict(e)
-            print_error(f"Inference error {dct['_adala_message']} after {n_attempts=}")
-            tb = traceback.format_exc()
-            logger.debug(tb)
-            return dct
+            dct = _log_llm_exception(e)
         except Exception as e:
+            # usage = e.total_usage
+            # not available here, so have to approximate by hand, assuming the same error occurred each time
+            n_attempts = retries.stop.max_attempt_number
+            prompt_tokens = n_attempts * litellm.token_counter(
+                model=self.model, messages=messages[:-1]
+            )  # response is appended as the last message
+            # TODO a pydantic validation error may be appended as the last message, don't know how to get the raw response in this case
+            completion_tokens = 0
+            usage = Usage(
+                prompt_tokens=prompt_tokens,
+                completion_tokens=completion_tokens,
+                total_tokens=(prompt_tokens + completion_tokens),
+            )
+
             # Catch case where the model does not return a properly formatted output
             if type(e).__name__ == "ValidationError" and "Invalid JSON" in str(e):
                 e = ConstrainedGenerationError()
-            # the only other instructor error that would be thrown is IncompleteOutputException due to max_tokens reached
-            dct = _format_error_dict(e)
-            print_error(f"Inference error {dct['_adala_message']}")
-            tb = traceback.format_exc()
-            logger.debug(tb)
-            return dct
+            # there are no other known errors to catch
+            dct = _log_llm_exception(e)
 
-        return response.dict()
+        # Add usage data to the response (e.g. token counts, cost)
+        dct.update(_get_usage_dict(usage, model=self.model))
+
+        return dct
 
 
 class AsyncLiteLLMChatRuntime(AsyncRuntime):
```
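Because every record, successful or failed, now carries the usage fields, a prompt-level token/cost KPI becomes a simple sum over the batch. A hedged sketch of that downstream aggregation (the records below are fabricated and the aggregation step itself is hypothetical, not code from this PR):

```python
# Hypothetical downstream aggregation over records shaped like the ones
# record_to_record / batch_to_batch now return.
import pandas as pd

records = [
    {"label": "A", "_prompt_tokens": 900, "_completion_tokens": 40,
     "_prompt_cost_usd": 0.00045, "_completion_cost_usd": 0.00006,
     "_total_cost_usd": 0.00051},
    {"_adala_message": "ConstrainedGenerationError",  # failed record, usage still attached
     "_prompt_tokens": 2700, "_completion_tokens": 0,
     "_prompt_cost_usd": 0.00135, "_completion_cost_usd": 0.0,
     "_total_cost_usd": 0.00135},
]

df = pd.DataFrame(records)
total_tokens = int((df["_prompt_tokens"] + df["_completion_tokens"]).sum())
total_cost = df["_total_cost_usd"].sum()  # NaN-safe if cost lookup failed for a model
print(f"{total_tokens} tokens, ${total_cost:.5f} total")
```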
```diff
@@ -304,14 +374,11 @@ async def batch_to_batch(
             axis=1,
         ).tolist()
 
-        retries = AsyncRetrying(
-            retry=retry_if_not_exception_type((ValidationError)),
-            stop=stop_after_attempt(3),
-        )
+        retries = AsyncRetrying(**RETRY_POLICY)
 
         tasks = [
             asyncio.ensure_future(
-                async_instructor_client.chat.completions.create(
+                async_instructor_client.chat.completions.create_with_completion(
                     messages=get_messages(
                         user_prompt,
                         instructions_template,
@@ -334,31 +401,48 @@ async def batch_to_batch(
         # convert list of LLMResponse objects to the dataframe records
         df_data = []
         for response in responses:
-            if isinstance(response, InstructorRetryException):
+            if isinstance(response, IncompleteOutputException):
                 e = response
+                usage = e.total_usage
+                dct = _log_llm_exception(e)
+            elif isinstance(response, InstructorRetryException):
+                e = response
+                usage = e.total_usage
                 # get root cause error from retries
                 n_attempts = e.n_attempts
                 e = e.__cause__.last_attempt.exception()
-                dct = _format_error_dict(e)
-                print_error(
-                    f"Inference error {dct['_adala_message']} after {n_attempts=}"
-                )
-                tb = traceback.format_exc()
-                logger.debug(tb)
-                df_data.append(dct)
+                dct = _log_llm_exception(e)
             elif isinstance(response, Exception):
                 e = response
+                # usage = e.total_usage
+                # not available here, so have to approximate by hand, assuming the same error occurred each time
+                n_attempts = retries.stop.max_attempt_number
+                messages = []  # TODO how to get these?
+                prompt_tokens = n_attempts * litellm.token_counter(
+                    model=self.model, messages=messages[:-1]
+                )  # response is appended as the last message
+                # TODO a pydantic validation error may be appended as the last message, don't know how to get the raw response in this case
+                completion_tokens = 0
+                usage = Usage(
+                    prompt_tokens,
+                    completion_tokens,
+                    total_tokens=(prompt_tokens + completion_tokens),
+                )
+
                 # Catch case where the model does not return a properly formatted output
                 if type(e).__name__ == "ValidationError" and "Invalid JSON" in str(e):
                     e = ConstrainedGenerationError()
                 # the only other instructor error that would be thrown is IncompleteOutputException due to max_tokens reached
-                dct = _format_error_dict(e)
-                print_error(f"Inference error {dct['_adala_message']}")
-                tb = traceback.format_exc()
-                logger.debug(tb)
-                df_data.append(dct)
+                dct = _log_llm_exception(e)
             else:
-                df_data.append(response.dict())
+                resp, completion = response
+                usage = completion.usage
+                dct = resp.dict()
+
+            # Add usage data to the response (e.g. token counts, cost)
+            dct.update(_get_usage_dict(usage, model=self.model))
+
+            df_data.append(dct)
 
         output_df = InternalDataFrame(df_data)
         return output_df.set_index(batch.index)
```
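The per-response dispatch above assumes the tasks are gathered with exceptions returned as values rather than raised (the gather call itself is not shown in these hunks), so each element is either a `(response, completion)` tuple or an exception object. A stand-alone sketch of that pattern, with a hypothetical `fake_request` standing in for `create_with_completion`:

```python
# Sketch of the gather-then-dispatch pattern batch_to_batch relies on;
# `fake_request` is a made-up stand-in, and return_exceptions=True is an
# assumption about how the tasks are awaited.
import asyncio


async def fake_request(i: int):
    if i == 1:
        raise TimeoutError("simulated failure")
    return {"output": f"label_{i}"}, {"usage": {"prompt_tokens": 10 * i}}


async def main() -> None:
    tasks = [asyncio.ensure_future(fake_request(i)) for i in range(3)]
    responses = await asyncio.gather(*tasks, return_exceptions=True)
    for response in responses:
        if isinstance(response, Exception):
            # handled like the exception branches in the loop above
            print("error:", response)
        else:
            resp, completion = response  # success: unpack the (response, completion) tuple
            print("ok:", resp, completion)


asyncio.run(main())
```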

docker-compose.yml (+1 −1)

```diff
@@ -45,7 +45,7 @@ services:
         condition: service_healthy
     environment:
       - REDIS_URL=redis://redis:6379/0
-      - MODULE_NAME=process_file.app
+      - MODULE_NAME=stream_inference.app
      - KAFKA_BOOTSTRAP_SERVERS=kafka:9093
      - LOG_LEVEL=DEBUG
      - C_FORCE_ROOT=true # needed when using pickle serializer in celery + running as root - remove when we dont run as root
```

poetry.lock (+24 −24)

(Generated file; diff not rendered.)

pyproject.toml (+1 −1)

```diff
@@ -47,7 +47,7 @@ instructor = "^1.3.7"
 [tool.poetry.dev-dependencies]
 pytest = "^7.4.3"
 pytest-cov = "^4.1.0"
-black = "^24.3.0"
+black = "^24.8.0"
 pytest-black = "^0.3.12"
 mkdocs = "^1.5.3"
 mkdocs-jupyter = "^0.24.3"
```

server/README.md (+1 −1)

````diff
@@ -25,7 +25,7 @@ poetry run uvicorn app:app --host 0.0.0.0 --port 30001
 
 ```bash
 cd tasks/
-poetry run celery -A process_file worker --loglevel=info
+poetry run celery -A stream_inference worker --loglevel=info
 ```
 
 # run in Docker
````

server/app.py (+1 −1)

```diff
@@ -19,7 +19,7 @@
 
 from server.handlers.result_handlers import ResultHandler
 from server.log_middleware import LogMiddleware
-from server.tasks.process_file import streaming_parent_task
+from server.tasks.stream_inference import streaming_parent_task
 from server.utils import (
     Settings,
     delete_topic,
```
