
Commit 8ee3229

Squashed commit of the following: (#9709)

commit b12a989 (Krrish Dholakia <[email protected]>, Wed Apr 2 08:09:56 2025 -0700)
    fix(utils.py): don't modify openai_token_counter
commit 294de31 (Krrish Dholakia <[email protected]>, Mon Mar 24 21:22:40 2025 -0700)
    fix: fix linting error
commit cb6e9fb (Krrish Dholakia <[email protected]>, Mon Mar 24 19:52:45 2025 -0700)
    refactor: complete migration
commit bfc1591 (Krrish Dholakia <[email protected]>, Mon Mar 24 19:09:59 2025 -0700)
    refactor: refactor more constants
commit 43ffb6a (Krrish Dholakia <[email protected]>, Mon Mar 24 18:45:24 2025 -0700)
    fix: test
commit 04dbe43 (Krrish Dholakia <[email protected]>, Mon Mar 24 18:28:58 2025 -0700)
    refactor: refactor: move more constants into constants.py
commit 3c26284 (Krrish Dholakia <[email protected]>, Mon Mar 24 18:14:46 2025 -0700)
    refactor: migrate hardcoded constants out of __init__.py
commit c11e0de (Krrish Dholakia <[email protected]>, Mon Mar 24 18:11:21 2025 -0700)
    build: migrate all constants into constants.py
commit 7882bdc (Krrish Dholakia <[email protected]>, Mon Mar 24 18:07:37 2025 -0700)
    build: initial test banning hardcoded numbers in repo

1 parent: 5a722ef

51 files changed: +509 -118 lines. (Large commits hide some content by default; only a subset of the changed files is shown below.)

litellm/__init__.py (+5 -2)

@@ -56,6 +56,9 @@
     bedrock_embedding_models,
     known_tokenizer_config,
     BEDROCK_INVOKE_PROVIDERS_LITERAL,
+    DEFAULT_MAX_TOKENS,
+    DEFAULT_SOFT_BUDGET,
+    DEFAULT_ALLOWED_FAILS,
 )
 from litellm.types.guardrails import GuardrailItem
 from litellm.proxy._types import (
@@ -155,7 +158,7 @@
     str
 ] = None  # Not used anymore, will be removed in next MAJOR release - https://github.com/BerriAI/litellm/discussions/648
 telemetry = True
-max_tokens = 256  # OpenAI Defaults
+max_tokens: int = DEFAULT_MAX_TOKENS  # OpenAI Defaults
 drop_params = bool(os.getenv("LITELLM_DROP_PARAMS", False))
 modify_params = False
 retry = True
@@ -244,7 +247,7 @@
     str
 ] = None  # proxy only - resets budget after fixed duration. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
 default_soft_budget: float = (
-    50.0  # by default all litellm proxy keys have a soft budget of 50.0
+    DEFAULT_SOFT_BUDGET  # by default all litellm proxy keys have a soft budget of 50.0
 )
 forward_traceparent_to_llm_provider: bool = False
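The shape of the whole migration shows up in this first file: the literal lives in exactly one module and every consumer imports a name for it. A minimal sketch of the pattern (the constant names match the diff; the single-file layout is simplified for illustration):

# constants.py - the only module that may contain the raw numbers
DEFAULT_MAX_TOKENS = 256  # OpenAI-style default completion budget
DEFAULT_SOFT_BUDGET = 50.0  # default soft budget for proxy keys

# consumers import names, never re-type values, e.g.:
#     from litellm.constants import DEFAULT_MAX_TOKENS, DEFAULT_SOFT_BUDGET
max_tokens: int = DEFAULT_MAX_TOKENS
default_soft_budget: float = DEFAULT_SOFT_BUDGET

Changing a default then becomes a one-line edit in constants.py instead of a repo-wide search for 256.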

litellm/_redis.py (+7 -4)

@@ -18,6 +18,7 @@
 import redis.asyncio as async_redis  # type: ignore
 
 from litellm import get_secret, get_secret_str
+from litellm.constants import REDIS_CONNECTION_POOL_TIMEOUT, REDIS_SOCKET_TIMEOUT
 
 from ._logging import verbose_logger
 
@@ -215,7 +216,7 @@ def _init_redis_sentinel(redis_kwargs) -> redis.Redis:
     # Set up the Sentinel client
     sentinel = redis.Sentinel(
         sentinel_nodes,
-        socket_timeout=0.1,
+        socket_timeout=REDIS_SOCKET_TIMEOUT,
         password=sentinel_password,
     )
@@ -239,7 +240,7 @@ def _init_async_redis_sentinel(redis_kwargs) -> async_redis.Redis:
     # Set up the Sentinel client
     sentinel = async_redis.Sentinel(
         sentinel_nodes,
-        socket_timeout=0.1,
+        socket_timeout=REDIS_SOCKET_TIMEOUT,
         password=sentinel_password,
     )
@@ -319,12 +320,14 @@ def get_redis_connection_pool(**env_overrides):
     verbose_logger.debug("get_redis_connection_pool: redis_kwargs", redis_kwargs)
     if "url" in redis_kwargs and redis_kwargs["url"] is not None:
         return async_redis.BlockingConnectionPool.from_url(
-            timeout=5, url=redis_kwargs["url"]
+            timeout=REDIS_CONNECTION_POOL_TIMEOUT, url=redis_kwargs["url"]
         )
     connection_class = async_redis.Connection
     if "ssl" in redis_kwargs:
         connection_class = async_redis.SSLConnection
         redis_kwargs.pop("ssl", None)
         redis_kwargs["connection_class"] = connection_class
     redis_kwargs.pop("startup_nodes", None)
-    return async_redis.BlockingConnectionPool(timeout=5, **redis_kwargs)
+    return async_redis.BlockingConnectionPool(
+        timeout=REDIS_CONNECTION_POOL_TIMEOUT, **redis_kwargs
+    )
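Both Sentinel clients and the connection pools now read their timeouts from the same two names. A minimal sketch of the pool half, assuming redis>=4.2 (where the asyncio client ships inside the redis package):

import redis.asyncio as async_redis

REDIS_SOCKET_TIMEOUT = 0.1         # seconds per socket operation
REDIS_CONNECTION_POOL_TIMEOUT = 5  # seconds to wait for a free connection

def make_blocking_pool(url: str) -> async_redis.BlockingConnectionPool:
    # A BlockingConnectionPool waits up to `timeout` seconds for a
    # connection to be returned to the pool, instead of raising
    # immediately when the pool is exhausted.
    return async_redis.BlockingConnectionPool.from_url(
        url, timeout=REDIS_CONNECTION_POOL_TIMEOUT
    )

pool = make_blocking_pool("redis://localhost:6379/0")
client = async_redis.Redis(connection_pool=pool)  # connects lazily on first command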

litellm/budget_manager.py (+12 -4)

@@ -14,6 +14,12 @@
 from typing import Literal, Optional
 
 import litellm
+from litellm.constants import (
+    DAYS_IN_A_MONTH,
+    DAYS_IN_A_WEEK,
+    DAYS_IN_A_YEAR,
+    HOURS_IN_A_DAY,
+)
 from litellm.utils import ModelResponse
 
 
@@ -81,11 +87,11 @@ def create_budget(
         if duration == "daily":
             duration_in_days = 1
         elif duration == "weekly":
-            duration_in_days = 7
+            duration_in_days = DAYS_IN_A_WEEK
        elif duration == "monthly":
-            duration_in_days = 28
+            duration_in_days = DAYS_IN_A_MONTH
         elif duration == "yearly":
-            duration_in_days = 365
+            duration_in_days = DAYS_IN_A_YEAR
         else:
             raise ValueError(
                 """duration needs to be one of ["daily", "weekly", "monthly", "yearly"]"""
@@ -182,7 +188,9 @@ def reset_on_duration(self, user: str):
         current_time = time.time()
 
         # Convert duration from days to seconds
-        duration_in_seconds = self.user_dict[user]["duration"] * 24 * 60 * 60
+        duration_in_seconds = (
+            self.user_dict[user]["duration"] * HOURS_IN_A_DAY * 60 * 60
+        )
 
         # Check if duration has elapsed
         if current_time - last_updated_at >= duration_in_seconds:
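The duration arithmetic being renamed here is small enough to check in isolation. A self-contained sketch (budget_duration_in_seconds is a hypothetical helper for illustration, not litellm's actual API):

HOURS_IN_A_DAY = 24
DAYS_IN_A_WEEK = 7
DAYS_IN_A_MONTH = 28  # litellm counts a month as 28 days
DAYS_IN_A_YEAR = 365

_DURATION_TO_DAYS = {
    "daily": 1,
    "weekly": DAYS_IN_A_WEEK,
    "monthly": DAYS_IN_A_MONTH,
    "yearly": DAYS_IN_A_YEAR,
}

def budget_duration_in_seconds(duration: str) -> int:
    # Mirrors create_budget/reset_on_duration: map the keyword to days,
    # then convert days -> seconds.
    if duration not in _DURATION_TO_DAYS:
        raise ValueError(
            'duration needs to be one of ["daily", "weekly", "monthly", "yearly"]'
        )
    return _DURATION_TO_DAYS[duration] * HOURS_IN_A_DAY * 60 * 60

assert budget_duration_in_seconds("weekly") == 7 * 24 * 60 * 60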

litellm/caching/caching.py (+2 -1)

@@ -19,6 +19,7 @@
 
 import litellm
 from litellm._logging import verbose_logger
+from litellm.constants import CACHED_STREAMING_CHUNK_DELAY
 from litellm.litellm_core_utils.model_param_helper import ModelParamHelper
 from litellm.types.caching import *
 from litellm.types.utils import all_litellm_params
@@ -406,7 +407,7 @@ def generate_streaming_content(self, content):
                     }
                 ]
             }
-            time.sleep(0.02)
+            time.sleep(CACHED_STREAMING_CHUNK_DELAY)
 
     def _get_cache_logic(
         self,

litellm/caching/in_memory_cache.py (+4 -2)

@@ -15,7 +15,8 @@
 
 from pydantic import BaseModel
 
-from ..constants import MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB
+from litellm.constants import MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB
+
 from .base_cache import BaseCache
 
 
@@ -52,7 +53,8 @@ def check_value_size(self, value: Any):
         # Fast path for common primitive types that are typically small
         if (
             isinstance(value, (bool, int, float, str))
-            and len(str(value)) < self.max_size_per_item * 512
+            and len(str(value))
+            < self.max_size_per_item * MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB
         ):  # Conservative estimate
             return True
 
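The fast path avoids serializing primitives by treating their str() length as a byte estimate. A standalone sketch of the check (check_value_size here is a simplified stand-in for the method in the diff):

from typing import Any

MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB = 512

def check_value_size(value: Any, max_size_per_item: int = 1) -> bool:
    # Fast path: for bool/int/float/str, len(str(value)) is a cheap,
    # conservative proxy for the serialized size in bytes.
    if (
        isinstance(value, (bool, int, float, str))
        and len(str(value))
        < max_size_per_item * MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB
    ):
        return True
    return False  # larger or complex values would need a real size check

assert check_value_size("short string") is True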

litellm/caching/qdrant_semantic_cache.py (+23 -11)

@@ -11,10 +11,12 @@
 import ast
 import asyncio
 import json
-from typing import Any
+from typing import Any, cast
 
 import litellm
 from litellm._logging import print_verbose
+from litellm.constants import QDRANT_SCALAR_QUANTILE, QDRANT_VECTOR_SIZE
+from litellm.types.utils import EmbeddingResponse
 
 from .base_cache import BaseCache
 
@@ -118,7 +120,11 @@ def __init__(  # noqa: PLR0915
             }
         elif quantization_config == "scalar":
             quantization_params = {
-                "scalar": {"type": "int8", "quantile": 0.99, "always_ram": False}
+                "scalar": {
+                    "type": "int8",
+                    "quantile": QDRANT_SCALAR_QUANTILE,
+                    "always_ram": False,
+                }
             }
         elif quantization_config == "product":
             quantization_params = {
@@ -132,7 +138,7 @@ def __init__(  # noqa: PLR0915
         new_collection_status = self.sync_client.put(
             url=f"{self.qdrant_api_base}/collections/{self.collection_name}",
             json={
-                "vectors": {"size": 1536, "distance": "Cosine"},
+                "vectors": {"size": QDRANT_VECTOR_SIZE, "distance": "Cosine"},
                 "quantization_config": quantization_params,
             },
             headers=self.headers,
@@ -171,10 +177,13 @@ def set_cache(self, key, value, **kwargs):
             prompt += message["content"]
 
         # create an embedding for prompt
-        embedding_response = litellm.embedding(
-            model=self.embedding_model,
-            input=prompt,
-            cache={"no-store": True, "no-cache": True},
+        embedding_response = cast(
+            EmbeddingResponse,
+            litellm.embedding(
+                model=self.embedding_model,
+                input=prompt,
+                cache={"no-store": True, "no-cache": True},
+            ),
         )
 
         # get the embedding
@@ -212,10 +221,13 @@ def get_cache(self, key, **kwargs):
             prompt += message["content"]
 
         # convert to embedding
-        embedding_response = litellm.embedding(
-            model=self.embedding_model,
-            input=prompt,
-            cache={"no-store": True, "no-cache": True},
+        embedding_response = cast(
+            EmbeddingResponse,
+            litellm.embedding(
+                model=self.embedding_model,
+                input=prompt,
+                cache={"no-store": True, "no-cache": True},
+            ),
        )
 
         # get the embedding
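The cast() wrappers exist purely for the type checker: litellm.embedding's return annotation is broader than EmbeddingResponse, so the call sites narrow it explicitly. A minimal sketch of the pattern with stand-in types (nothing here imports litellm; names are illustrative):

from dataclasses import dataclass, field
from typing import Any, cast

@dataclass
class EmbeddingResponse:
    # simplified stand-in for litellm.types.utils.EmbeddingResponse
    data: list = field(default_factory=list)

def embedding(model: str, input: str) -> Any:
    # stand-in for litellm.embedding, whose annotation is looser
    # than what callers actually receive
    return EmbeddingResponse(data=[{"embedding": [0.0] * 8}])

def embed_prompt(prompt: str) -> EmbeddingResponse:
    # cast() is a no-op at runtime; it only tells mypy/pyright
    # that this call yields an EmbeddingResponse.
    return cast(EmbeddingResponse, embedding(model="stub-model", input=prompt))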

litellm/constants.py (+70)

@@ -9,23 +9,79 @@
     0.5  # default cooldown a deployment if 50% of requests fail in a given minute
 )
 DEFAULT_MAX_TOKENS = 4096
+DEFAULT_ALLOWED_FAILS = 3
 DEFAULT_REDIS_SYNC_INTERVAL = 1
 DEFAULT_COOLDOWN_TIME_SECONDS = 5
 DEFAULT_REPLICATE_POLLING_RETRIES = 5
 DEFAULT_REPLICATE_POLLING_DELAY_SECONDS = 1
 DEFAULT_IMAGE_TOKEN_COUNT = 250
 DEFAULT_IMAGE_WIDTH = 300
 DEFAULT_IMAGE_HEIGHT = 300
+DEFAULT_MAX_TOKENS = 256  # used when providers need a default
 MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB = 1024  # 1MB = 1024KB
 SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD = 1000  # Minimum number of requests to consider "reasonable traffic". Used for single-deployment cooldown logic.
 REDIS_UPDATE_BUFFER_KEY = "litellm_spend_update_buffer"
 REDIS_DAILY_SPEND_UPDATE_BUFFER_KEY = "litellm_daily_spend_update_buffer"
 MAX_REDIS_BUFFER_DEQUEUE_COUNT = 100
+MINIMUM_PROMPT_CACHE_TOKEN_COUNT = (
+    1024  # minimum number of tokens to cache a prompt by Anthropic
+)
+DEFAULT_TRIM_RATIO = 0.75  # default ratio of tokens to trim from the end of a prompt
+HOURS_IN_A_DAY = 24
+DAYS_IN_A_WEEK = 7
+DAYS_IN_A_MONTH = 28
+DAYS_IN_A_YEAR = 365
+REPLICATE_MODEL_NAME_WITH_ID_LENGTH = 64
+#### TOKEN COUNTING ####
+FUNCTION_DEFINITION_TOKEN_COUNT = 9
+SYSTEM_MESSAGE_TOKEN_COUNT = 4
+TOOL_CHOICE_OBJECT_TOKEN_COUNT = 4
+DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT = 10
+DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT = 20
+MAX_SHORT_SIDE_FOR_IMAGE_HIGH_RES = 768
+MAX_LONG_SIDE_FOR_IMAGE_HIGH_RES = 2000
+MAX_TILE_WIDTH = 512
+MAX_TILE_HEIGHT = 512
+OPENAI_FILE_SEARCH_COST_PER_1K_CALLS = 2.5 / 1000
+MIN_NON_ZERO_TEMPERATURE = 0.0001
 #### RELIABILITY ####
 REPEATED_STREAMING_CHUNK_LIMIT = 100  # catch if model starts looping the same chunk while streaming. Uses high default to prevent false positives.
+DEFAULT_MAX_LRU_CACHE_SIZE = 16
+INITIAL_RETRY_DELAY = 0.5
+MAX_RETRY_DELAY = 8.0
+JITTER = 0.75
+DEFAULT_IN_MEMORY_TTL = 5  # default time to live for the in-memory cache
+DEFAULT_POLLING_INTERVAL = 0.03  # default polling interval for the scheduler
+AZURE_OPERATION_POLLING_TIMEOUT = 120
+REDIS_SOCKET_TIMEOUT = 0.1
+REDIS_CONNECTION_POOL_TIMEOUT = 5
+NON_LLM_CONNECTION_TIMEOUT = 15  # timeout for adjacent services (e.g. jwt auth)
+MAX_EXCEPTION_MESSAGE_LENGTH = 2000
+BEDROCK_MAX_POLICY_SIZE = 75
+REPLICATE_POLLING_DELAY_SECONDS = 0.5
+DEFAULT_ANTHROPIC_CHAT_MAX_TOKENS = 4096
+TOGETHER_AI_4_B = 4
+TOGETHER_AI_8_B = 8
+TOGETHER_AI_21_B = 21
+TOGETHER_AI_41_B = 41
+TOGETHER_AI_80_B = 80
+TOGETHER_AI_110_B = 110
+TOGETHER_AI_EMBEDDING_150_M = 150
+TOGETHER_AI_EMBEDDING_350_M = 350
+QDRANT_SCALAR_QUANTILE = 0.99
+QDRANT_VECTOR_SIZE = 1536
+CACHED_STREAMING_CHUNK_DELAY = 0.02
+MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB = 512
+DEFAULT_MAX_TOKENS_FOR_TRITON = 2000
 #### Networking settings ####
 request_timeout: float = 6000  # time in seconds
 STREAM_SSE_DONE_STRING: str = "[DONE]"
+### SPEND TRACKING ###
+DEFAULT_REPLICATE_GPU_PRICE_PER_SECOND = 0.001400  # price per second for a100 80GB
+FIREWORKS_AI_56_B_MOE = 56
+FIREWORKS_AI_176_B_MOE = 176
+FIREWORKS_AI_16_B = 16
+FIREWORKS_AI_80_B = 80
 
 LITELLM_CHAT_PROVIDERS = [
     "openai",
@@ -426,6 +482,9 @@
 MAX_SPENDLOG_ROWS_TO_QUERY = (
     1_000_000  # if spendLogs has more than 1M rows, do not query the DB
 )
+DEFAULT_SOFT_BUDGET = (
+    50.0  # by default all litellm proxy keys have a soft budget of 50.0
+)
 # makes it clear this is a rate limit error for a litellm virtual key
 RATE_LIMIT_ERROR_MESSAGE_FOR_VIRTUAL_KEY = "LiteLLM Virtual Key user_api_key_hash"
 
@@ -451,3 +510,14 @@
 ########################### DB CRON JOB NAMES ###########################
 DB_SPEND_UPDATE_JOB_NAME = "db_spend_update_job"
 DEFAULT_CRON_JOB_LOCK_TTL_SECONDS = 60  # 1 minute
+PROXY_BUDGET_RESCHEDULER_MIN_TIME = 597
+PROXY_BUDGET_RESCHEDULER_MAX_TIME = 605
+PROXY_BATCH_WRITE_AT = 10  # in seconds
+DEFAULT_HEALTH_CHECK_INTERVAL = 300  # 5 minutes
+PROMETHEUS_FALLBACK_STATS_SEND_TIME_HOURS = 9
+DEFAULT_MODEL_CREATED_AT_TIME = 1677610602  # returns on `/models` endpoint
+DEFAULT_SLACK_ALERTING_THRESHOLD = 300
+MAX_TEAM_LIST_LIMIT = 20
+DEFAULT_PROMPT_INJECTION_SIMILARITY_THRESHOLD = 0.7
+LENGTH_OF_LITELLM_GENERATED_KEY = 16
+SECRET_MANAGER_REFRESH_INTERVAL = 86400
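The first commit in the squash ("initial test banning hardcoded numbers in repo") suggests this file is backed by a lint-style test. A hedged sketch of what such a test can look like using the ast module (illustrative only; the actual test litellm added is not shown in this diff):

import ast
from pathlib import Path

ALLOWED_NUMBERS = {0, 1, 2}  # trivial values that don't need names

def find_magic_numbers(source: str, filename: str) -> list:
    # Walk the AST and flag numeric literals; bool is excluded because
    # it subclasses int.
    violations = []
    for node in ast.walk(ast.parse(source, filename=filename)):
        if (
            isinstance(node, ast.Constant)
            and isinstance(node.value, (int, float))
            and not isinstance(node.value, bool)
            and node.value not in ALLOWED_NUMBERS
        ):
            violations.append(f"{filename}:{node.lineno}: hardcoded {node.value!r}")
    return violations

def test_no_magic_numbers():
    for path in Path("litellm").rglob("*.py"):
        if path.name == "constants.py":
            continue  # the one module where raw numbers may live
        assert find_magic_numbers(path.read_text(), str(path)) == []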

litellm/cost_calculator.py (+6 -4)

@@ -9,6 +9,10 @@
 import litellm
 import litellm._logging
 from litellm import verbose_logger
+from litellm.constants import (
+    DEFAULT_MAX_LRU_CACHE_SIZE,
+    DEFAULT_REPLICATE_GPU_PRICE_PER_SECOND,
+)
 from litellm.litellm_core_utils.llm_cost_calc.tool_call_cost_tracking import (
     StandardBuiltInToolCostTracking,
 )
@@ -355,9 +359,7 @@ def cost_per_token(  # noqa: PLR0915
 def get_replicate_completion_pricing(completion_response: dict, total_time=0.0):
     # see https://replicate.com/pricing
     # for all litellm currently supported LLMs, almost all requests go to a100_80gb
-    a100_80gb_price_per_second_public = (
-        0.001400  # assume all calls sent to A100 80GB for now
-    )
+    a100_80gb_price_per_second_public = DEFAULT_REPLICATE_GPU_PRICE_PER_SECOND  # assume all calls sent to A100 80GB for now
     if total_time == 0.0:  # total time is in ms
         start_time = completion_response.get("created", time.time())
         end_time = getattr(completion_response, "ended", time.time())
@@ -450,7 +452,7 @@ def _select_model_name_for_cost_calc(
     return return_model
 
 
-@lru_cache(maxsize=16)
+@lru_cache(maxsize=DEFAULT_MAX_LRU_CACHE_SIZE)
 def _model_contains_known_llm_provider(model: str) -> bool:
     """
     Check if the model contains a known llm provider
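The lru_cache change is behavior-preserving; only the maxsize literal gets a name. For context, a self-contained sketch of how the memoization works (the provider list below is illustrative, not litellm's actual set):

from functools import lru_cache

DEFAULT_MAX_LRU_CACHE_SIZE = 16

@lru_cache(maxsize=DEFAULT_MAX_LRU_CACHE_SIZE)
def model_contains_known_llm_provider(model: str) -> bool:
    # Memoized so repeated cost lookups for the same model string
    # skip the scan; only the 16 most recent results are kept.
    known_prefixes = ("openai/", "anthropic/", "bedrock/")
    return model.startswith(known_prefixes)

model_contains_known_llm_provider("openai/gpt-4o")  # miss: computed
model_contains_known_llm_provider("openai/gpt-4o")  # hit: served from cache
print(model_contains_known_llm_provider.cache_info())  # hits=1, misses=1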

litellm/integrations/SlackAlerting/slack_alerting.py (+4 -3)

@@ -16,6 +16,7 @@
 import litellm.types
 from litellm._logging import verbose_logger, verbose_proxy_logger
 from litellm.caching.caching import DualCache
+from litellm.constants import HOURS_IN_A_DAY
 from litellm.integrations.custom_batch_logger import CustomBatchLogger
 from litellm.litellm_core_utils.duration_parser import duration_in_seconds
 from litellm.litellm_core_utils.exception_mapping_utils import (
@@ -649,10 +650,10 @@ async def budget_alerts(  # noqa: PLR0915
                 event_message += (
                     f"Budget Crossed\n Total Budget:`{user_info.max_budget}`"
                 )
-            elif percent_left <= 0.05:
+            elif percent_left <= SLACK_ALERTING_THRESHOLD_5_PERCENT:
                 event = "threshold_crossed"
                 event_message += "5% Threshold Crossed "
-            elif percent_left <= 0.15:
+            elif percent_left <= SLACK_ALERTING_THRESHOLD_15_PERCENT:
                 event = "threshold_crossed"
                 event_message += "15% Threshold Crossed"
             elif user_info.soft_budget is not None:
@@ -1718,7 +1719,7 @@ async def send_monthly_spend_report(self):
                 await self.internal_usage_cache.async_set_cache(
                     key=_event_cache_key,
                     value="SENT",
-                    ttl=(30 * 24 * 60 * 60),  # 1 month
+                    ttl=(30 * HOURS_IN_A_DAY * 60 * 60),  # 1 month
                 )
 
             except Exception as e:
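The two replaced literals (0.05 and 0.15) become SLACK_ALERTING_THRESHOLD_5_PERCENT and SLACK_ALERTING_THRESHOLD_15_PERCENT. The alert ladder itself reduces to a small function, sketched here in simplified form (the real method also handles soft budgets and projected spend):

SLACK_ALERTING_THRESHOLD_5_PERCENT = 0.05
SLACK_ALERTING_THRESHOLD_15_PERCENT = 0.15

def budget_alert_event(percent_left: float) -> str:
    # Checked most-severe first, matching the elif chain in the diff.
    if percent_left <= 0:
        return "budget_crossed"
    if percent_left <= SLACK_ALERTING_THRESHOLD_5_PERCENT:
        return "threshold_crossed (5%)"
    if percent_left <= SLACK_ALERTING_THRESHOLD_15_PERCENT:
        return "threshold_crossed (15%)"
    return "ok"

assert budget_alert_event(0.10) == "threshold_crossed (15%)"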

litellm/integrations/datadog/datadog.py (+1 -1)

@@ -41,7 +41,7 @@
 from ..additional_logging_utils import AdditionalLoggingUtils
 
 # max number of logs DD API can accept
-DD_MAX_BATCH_SIZE = 1000
+
 
 # specify what ServiceTypes are logged as success events to DD. (We don't want to spam DD traces with large number of service types)
 DD_LOGGED_SUCCESS_SERVICE_TYPES = [
