
Commit 8ee3229

Squashed commit of the following: (#9709)

commit b12a989 (Krrish Dholakia <[email protected]>, Wed Apr 2 08:09:56 2025 -0700)
    fix(utils.py): don't modify openai_token_counter
commit 294de31 (Krrish Dholakia <[email protected]>, Mon Mar 24 21:22:40 2025 -0700)
    fix: fix linting error
commit cb6e9fb (Krrish Dholakia <[email protected]>, Mon Mar 24 19:52:45 2025 -0700)
    refactor: complete migration
commit bfc1591 (Krrish Dholakia <[email protected]>, Mon Mar 24 19:09:59 2025 -0700)
    refactor: refactor more constants
commit 43ffb6a (Krrish Dholakia <[email protected]>, Mon Mar 24 18:45:24 2025 -0700)
    fix: test
commit 04dbe43 (Krrish Dholakia <[email protected]>, Mon Mar 24 18:28:58 2025 -0700)
    refactor: refactor: move more constants into constants.py
commit 3c26284 (Krrish Dholakia <[email protected]>, Mon Mar 24 18:14:46 2025 -0700)
    refactor: migrate hardcoded constants out of __init__.py
commit c11e0de (Krrish Dholakia <[email protected]>, Mon Mar 24 18:11:21 2025 -0700)
    build: migrate all constants into constants.py
commit 7882bdc (Krrish Dholakia <[email protected]>, Mon Mar 24 18:07:37 2025 -0700)
    build: initial test banning hardcoded numbers in repo

1 parent: 5a722ef

51 files changed: +509 -118 lines. (Large commits hide some content by default; only a subset of the changed files is shown below.)

litellm/__init__.py (+5 -2)

@@ -56,6 +56,9 @@
     bedrock_embedding_models,
     known_tokenizer_config,
     BEDROCK_INVOKE_PROVIDERS_LITERAL,
+    DEFAULT_MAX_TOKENS,
+    DEFAULT_SOFT_BUDGET,
+    DEFAULT_ALLOWED_FAILS,
 )
 from litellm.types.guardrails import GuardrailItem
 from litellm.proxy._types import (
@@ -155,7 +158,7 @@
     str
 ] = None  # Not used anymore, will be removed in next MAJOR release - https://github.com/BerriAI/litellm/discussions/648
 telemetry = True
-max_tokens = 256  # OpenAI Defaults
+max_tokens: int = DEFAULT_MAX_TOKENS  # OpenAI Defaults
 drop_params = bool(os.getenv("LITELLM_DROP_PARAMS", False))
 modify_params = False
 retry = True
@@ -244,7 +247,7 @@
     str
 ] = None  # proxy only - resets budget after fixed duration. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
 default_soft_budget: float = (
-    50.0  # by default all litellm proxy keys have a soft budget of 50.0
+    DEFAULT_SOFT_BUDGET  # by default all litellm proxy keys have a soft budget of 50.0
 )
 forward_traceparent_to_llm_provider: bool = False
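The shape of the whole migration shows up in this first file: the literal lives in exactly one module and every consumer imports a name for it. A minimal sketch of the pattern (the constant names match the diff; the single-file layout is simplified for illustration):

# constants.py - the only module that may contain the raw numbers
DEFAULT_MAX_TOKENS = 256  # OpenAI-style default completion budget
DEFAULT_SOFT_BUDGET = 50.0  # default soft budget for proxy keys

# consumers import names, never re-type values, e.g.:
#     from litellm.constants import DEFAULT_MAX_TOKENS, DEFAULT_SOFT_BUDGET
max_tokens: int = DEFAULT_MAX_TOKENS
default_soft_budget: float = DEFAULT_SOFT_BUDGET

Changing a default then becomes a one-line edit in constants.py instead of a repo-wide search for 256.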

litellm/_redis.py (+7 -4)

@@ -18,6 +18,7 @@
 import redis.asyncio as async_redis  # type: ignore
 
 from litellm import get_secret, get_secret_str
+from litellm.constants import REDIS_CONNECTION_POOL_TIMEOUT, REDIS_SOCKET_TIMEOUT
 
 from ._logging import verbose_logger
 
@@ -215,7 +216,7 @@ def _init_redis_sentinel(redis_kwargs) -> redis.Redis:
     # Set up the Sentinel client
     sentinel = redis.Sentinel(
         sentinel_nodes,
-        socket_timeout=0.1,
+        socket_timeout=REDIS_SOCKET_TIMEOUT,
         password=sentinel_password,
     )
@@ -239,7 +240,7 @@ def _init_async_redis_sentinel(redis_kwargs) -> async_redis.Redis:
     # Set up the Sentinel client
     sentinel = async_redis.Sentinel(
         sentinel_nodes,
-        socket_timeout=0.1,
+        socket_timeout=REDIS_SOCKET_TIMEOUT,
         password=sentinel_password,
     )
@@ -319,12 +320,14 @@ def get_redis_connection_pool(**env_overrides):
     verbose_logger.debug("get_redis_connection_pool: redis_kwargs", redis_kwargs)
     if "url" in redis_kwargs and redis_kwargs["url"] is not None:
         return async_redis.BlockingConnectionPool.from_url(
-            timeout=5, url=redis_kwargs["url"]
+            timeout=REDIS_CONNECTION_POOL_TIMEOUT, url=redis_kwargs["url"]
         )
     connection_class = async_redis.Connection
     if "ssl" in redis_kwargs:
         connection_class = async_redis.SSLConnection
         redis_kwargs.pop("ssl", None)
         redis_kwargs["connection_class"] = connection_class
     redis_kwargs.pop("startup_nodes", None)
-    return async_redis.BlockingConnectionPool(timeout=5, **redis_kwargs)
+    return async_redis.BlockingConnectionPool(
+        timeout=REDIS_CONNECTION_POOL_TIMEOUT, **redis_kwargs
+    )
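Both Sentinel clients and the connection pools now read their timeouts from the same two names. A minimal sketch of the pool half, assuming redis>=4.2 (where the asyncio client ships inside the redis package):

import redis.asyncio as async_redis

REDIS_SOCKET_TIMEOUT = 0.1         # seconds per socket operation
REDIS_CONNECTION_POOL_TIMEOUT = 5  # seconds to wait for a free connection

def make_blocking_pool(url: str) -> async_redis.BlockingConnectionPool:
    # A BlockingConnectionPool waits up to `timeout` seconds for a
    # connection to be returned to the pool, instead of raising
    # immediately when the pool is exhausted.
    return async_redis.BlockingConnectionPool.from_url(
        url, timeout=REDIS_CONNECTION_POOL_TIMEOUT
    )

pool = make_blocking_pool("redis://localhost:6379/0")
client = async_redis.Redis(connection_pool=pool)  # connects lazily on first command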

litellm/budget_manager.py (+12 -4)

@@ -14,6 +14,12 @@
 from typing import Literal, Optional
 
 import litellm
+from litellm.constants import (
+    DAYS_IN_A_MONTH,
+    DAYS_IN_A_WEEK,
+    DAYS_IN_A_YEAR,
+    HOURS_IN_A_DAY,
+)
 from litellm.utils import ModelResponse
 
 
@@ -81,11 +87,11 @@ def create_budget(
         if duration == "daily":
             duration_in_days = 1
         elif duration == "weekly":
-            duration_in_days = 7
+            duration_in_days = DAYS_IN_A_WEEK
        elif duration == "monthly":
-            duration_in_days = 28
+            duration_in_days = DAYS_IN_A_MONTH
         elif duration == "yearly":
-            duration_in_days = 365
+            duration_in_days = DAYS_IN_A_YEAR
         else:
             raise ValueError(
                 """duration needs to be one of ["daily", "weekly", "monthly", "yearly"]"""
@@ -182,7 +188,9 @@ def reset_on_duration(self, user: str):
         current_time = time.time()
 
         # Convert duration from days to seconds
-        duration_in_seconds = self.user_dict[user]["duration"] * 24 * 60 * 60
+        duration_in_seconds = (
+            self.user_dict[user]["duration"] * HOURS_IN_A_DAY * 60 * 60
+        )
 
         # Check if duration has elapsed
         if current_time - last_updated_at >= duration_in_seconds:
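The duration arithmetic being renamed here is small enough to check in isolation. A self-contained sketch (budget_duration_in_seconds is a hypothetical helper for illustration, not litellm's actual API):

HOURS_IN_A_DAY = 24
DAYS_IN_A_WEEK = 7
DAYS_IN_A_MONTH = 28  # litellm counts a month as 28 days
DAYS_IN_A_YEAR = 365

_DURATION_TO_DAYS = {
    "daily": 1,
    "weekly": DAYS_IN_A_WEEK,
    "monthly": DAYS_IN_A_MONTH,
    "yearly": DAYS_IN_A_YEAR,
}

def budget_duration_in_seconds(duration: str) -> int:
    # Mirrors create_budget/reset_on_duration: map the keyword to days,
    # then convert days -> seconds.
    if duration not in _DURATION_TO_DAYS:
        raise ValueError(
            'duration needs to be one of ["daily", "weekly", "monthly", "yearly"]'
        )
    return _DURATION_TO_DAYS[duration] * HOURS_IN_A_DAY * 60 * 60

assert budget_duration_in_seconds("weekly") == 7 * 24 * 60 * 60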

litellm/caching/caching.py (+2 -1)

@@ -19,6 +19,7 @@
 
 import litellm
 from litellm._logging import verbose_logger
+from litellm.constants import CACHED_STREAMING_CHUNK_DELAY
 from litellm.litellm_core_utils.model_param_helper import ModelParamHelper
 from litellm.types.caching import *
 from litellm.types.utils import all_litellm_params
@@ -406,7 +407,7 @@ def generate_streaming_content(self, content):
                     }
                 ]
             }
-            time.sleep(0.02)
+            time.sleep(CACHED_STREAMING_CHUNK_DELAY)
 
     def _get_cache_logic(
         self,

litellm/caching/in_memory_cache.py (+4 -2)

@@ -15,7 +15,8 @@
 
 from pydantic import BaseModel
 
-from ..constants import MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB
+from litellm.constants import MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB
+
 from .base_cache import BaseCache
 
 
@@ -52,7 +53,8 @@ def check_value_size(self, value: Any):
         # Fast path for common primitive types that are typically small
         if (
             isinstance(value, (bool, int, float, str))
-            and len(str(value)) < self.max_size_per_item * 512
+            and len(str(value))
+            < self.max_size_per_item * MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB
         ):  # Conservative estimate
             return True
 
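The fast path avoids serializing primitives by treating their str() length as a byte estimate. A standalone sketch of the check (check_value_size here is a simplified stand-in for the method in the diff):

from typing import Any

MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB = 512

def check_value_size(value: Any, max_size_per_item: int = 1) -> bool:
    # Fast path: for bool/int/float/str, len(str(value)) is a cheap,
    # conservative proxy for the serialized size in bytes.
    if (
        isinstance(value, (bool, int, float, str))
        and len(str(value))
        < max_size_per_item * MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB
    ):
        return True
    return False  # larger or complex values would need a real size check

assert check_value_size("short string") is True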

litellm/caching/qdrant_semantic_cache.py (+23 -11)

@@ -11,10 +11,12 @@
 import ast
 import asyncio
 import json
-from typing import Any
+from typing import Any, cast
 
 import litellm
 from litellm._logging import print_verbose
+from litellm.constants import QDRANT_SCALAR_QUANTILE, QDRANT_VECTOR_SIZE
+from litellm.types.utils import EmbeddingResponse
 
 from .base_cache import BaseCache
 
@@ -118,7 +120,11 @@ def __init__(  # noqa: PLR0915
             }
         elif quantization_config == "scalar":
             quantization_params = {
-                "scalar": {"type": "int8", "quantile": 0.99, "always_ram": False}
+                "scalar": {
+                    "type": "int8",
+                    "quantile": QDRANT_SCALAR_QUANTILE,
+                    "always_ram": False,
+                }
             }
         elif quantization_config == "product":
             quantization_params = {
@@ -132,7 +138,7 @@ def __init__(  # noqa: PLR0915
         new_collection_status = self.sync_client.put(
             url=f"{self.qdrant_api_base}/collections/{self.collection_name}",
             json={
-                "vectors": {"size": 1536, "distance": "Cosine"},
+                "vectors": {"size": QDRANT_VECTOR_SIZE, "distance": "Cosine"},
                 "quantization_config": quantization_params,
             },
             headers=self.headers,
@@ -171,10 +177,13 @@ def set_cache(self, key, value, **kwargs):
             prompt += message["content"]
 
         # create an embedding for prompt
-        embedding_response = litellm.embedding(
-            model=self.embedding_model,
-            input=prompt,
-            cache={"no-store": True, "no-cache": True},
+        embedding_response = cast(
+            EmbeddingResponse,
+            litellm.embedding(
+                model=self.embedding_model,
+                input=prompt,
+                cache={"no-store": True, "no-cache": True},
+            ),
         )
 
         # get the embedding
@@ -212,10 +221,13 @@ def get_cache(self, key, **kwargs):
             prompt += message["content"]
 
         # convert to embedding
-        embedding_response = litellm.embedding(
-            model=self.embedding_model,
-            input=prompt,
-            cache={"no-store": True, "no-cache": True},
+        embedding_response = cast(
+            EmbeddingResponse,
+            litellm.embedding(
+                model=self.embedding_model,
+                input=prompt,
+                cache={"no-store": True, "no-cache": True},
+            ),
        )
 
         # get the embedding
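The cast() wrappers exist purely for the type checker: litellm.embedding's return annotation is broader than EmbeddingResponse, so the call sites narrow it explicitly. A minimal sketch of the pattern with stand-in types (nothing here imports litellm; names are illustrative):

from dataclasses import dataclass, field
from typing import Any, cast

@dataclass
class EmbeddingResponse:
    # simplified stand-in for litellm.types.utils.EmbeddingResponse
    data: list = field(default_factory=list)

def embedding(model: str, input: str) -> Any:
    # stand-in for litellm.embedding, whose annotation is looser
    # than what callers actually receive
    return EmbeddingResponse(data=[{"embedding": [0.0] * 8}])

def embed_prompt(prompt: str) -> EmbeddingResponse:
    # cast() is a no-op at runtime; it only tells mypy/pyright
    # that this call yields an EmbeddingResponse.
    return cast(EmbeddingResponse, embedding(model="stub-model", input=prompt))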

litellm/constants.py (+70)

@@ -9,23 +9,79 @@
     0.5  # default cooldown a deployment if 50% of requests fail in a given minute
 )
 DEFAULT_MAX_TOKENS = 4096
+DEFAULT_ALLOWED_FAILS = 3
 DEFAULT_REDIS_SYNC_INTERVAL = 1
 DEFAULT_COOLDOWN_TIME_SECONDS = 5
 DEFAULT_REPLICATE_POLLING_RETRIES = 5
 DEFAULT_REPLICATE_POLLING_DELAY_SECONDS = 1
 DEFAULT_IMAGE_TOKEN_COUNT = 250
 DEFAULT_IMAGE_WIDTH = 300
 DEFAULT_IMAGE_HEIGHT = 300
+DEFAULT_MAX_TOKENS = 256  # used when providers need a default
 MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB = 1024  # 1MB = 1024KB
 SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD = 1000  # Minimum number of requests to consider "reasonable traffic". Used for single-deployment cooldown logic.
 REDIS_UPDATE_BUFFER_KEY = "litellm_spend_update_buffer"
 REDIS_DAILY_SPEND_UPDATE_BUFFER_KEY = "litellm_daily_spend_update_buffer"
 MAX_REDIS_BUFFER_DEQUEUE_COUNT = 100
+MINIMUM_PROMPT_CACHE_TOKEN_COUNT = (
+    1024  # minimum number of tokens to cache a prompt by Anthropic
+)
+DEFAULT_TRIM_RATIO = 0.75  # default ratio of tokens to trim from the end of a prompt
+HOURS_IN_A_DAY = 24
+DAYS_IN_A_WEEK = 7
+DAYS_IN_A_MONTH = 28
+DAYS_IN_A_YEAR = 365
+REPLICATE_MODEL_NAME_WITH_ID_LENGTH = 64
+#### TOKEN COUNTING ####
+FUNCTION_DEFINITION_TOKEN_COUNT = 9
+SYSTEM_MESSAGE_TOKEN_COUNT = 4
+TOOL_CHOICE_OBJECT_TOKEN_COUNT = 4
+DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT = 10
+DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT = 20
+MAX_SHORT_SIDE_FOR_IMAGE_HIGH_RES = 768
+MAX_LONG_SIDE_FOR_IMAGE_HIGH_RES = 2000
+MAX_TILE_WIDTH = 512
+MAX_TILE_HEIGHT = 512
+OPENAI_FILE_SEARCH_COST_PER_1K_CALLS = 2.5 / 1000
+MIN_NON_ZERO_TEMPERATURE = 0.0001
 #### RELIABILITY ####
 REPEATED_STREAMING_CHUNK_LIMIT = 100  # catch if model starts looping the same chunk while streaming. Uses high default to prevent false positives.
+DEFAULT_MAX_LRU_CACHE_SIZE = 16
+INITIAL_RETRY_DELAY = 0.5
+MAX_RETRY_DELAY = 8.0
+JITTER = 0.75
+DEFAULT_IN_MEMORY_TTL = 5  # default time to live for the in-memory cache
+DEFAULT_POLLING_INTERVAL = 0.03  # default polling interval for the scheduler
+AZURE_OPERATION_POLLING_TIMEOUT = 120
+REDIS_SOCKET_TIMEOUT = 0.1
+REDIS_CONNECTION_POOL_TIMEOUT = 5
+NON_LLM_CONNECTION_TIMEOUT = 15  # timeout for adjacent services (e.g. jwt auth)
+MAX_EXCEPTION_MESSAGE_LENGTH = 2000
+BEDROCK_MAX_POLICY_SIZE = 75
+REPLICATE_POLLING_DELAY_SECONDS = 0.5
+DEFAULT_ANTHROPIC_CHAT_MAX_TOKENS = 4096
+TOGETHER_AI_4_B = 4
+TOGETHER_AI_8_B = 8
+TOGETHER_AI_21_B = 21
+TOGETHER_AI_41_B = 41
+TOGETHER_AI_80_B = 80
+TOGETHER_AI_110_B = 110
+TOGETHER_AI_EMBEDDING_150_M = 150
+TOGETHER_AI_EMBEDDING_350_M = 350
+QDRANT_SCALAR_QUANTILE = 0.99
+QDRANT_VECTOR_SIZE = 1536
+CACHED_STREAMING_CHUNK_DELAY = 0.02
+MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB = 512
+DEFAULT_MAX_TOKENS_FOR_TRITON = 2000
 #### Networking settings ####
 request_timeout: float = 6000  # time in seconds
 STREAM_SSE_DONE_STRING: str = "[DONE]"
+### SPEND TRACKING ###
+DEFAULT_REPLICATE_GPU_PRICE_PER_SECOND = 0.001400  # price per second for a100 80GB
+FIREWORKS_AI_56_B_MOE = 56
+FIREWORKS_AI_176_B_MOE = 176
+FIREWORKS_AI_16_B = 16
+FIREWORKS_AI_80_B = 80
 
 LITELLM_CHAT_PROVIDERS = [
     "openai",
@@ -426,6 +482,9 @@
 MAX_SPENDLOG_ROWS_TO_QUERY = (
     1_000_000  # if spendLogs has more than 1M rows, do not query the DB
 )
+DEFAULT_SOFT_BUDGET = (
+    50.0  # by default all litellm proxy keys have a soft budget of 50.0
+)
 # makes it clear this is a rate limit error for a litellm virtual key
 RATE_LIMIT_ERROR_MESSAGE_FOR_VIRTUAL_KEY = "LiteLLM Virtual Key user_api_key_hash"
 
@@ -451,3 +510,14 @@
 ########################### DB CRON JOB NAMES ###########################
 DB_SPEND_UPDATE_JOB_NAME = "db_spend_update_job"
 DEFAULT_CRON_JOB_LOCK_TTL_SECONDS = 60  # 1 minute
+PROXY_BUDGET_RESCHEDULER_MIN_TIME = 597
+PROXY_BUDGET_RESCHEDULER_MAX_TIME = 605
+PROXY_BATCH_WRITE_AT = 10  # in seconds
+DEFAULT_HEALTH_CHECK_INTERVAL = 300  # 5 minutes
+PROMETHEUS_FALLBACK_STATS_SEND_TIME_HOURS = 9
+DEFAULT_MODEL_CREATED_AT_TIME = 1677610602  # returns on `/models` endpoint
+DEFAULT_SLACK_ALERTING_THRESHOLD = 300
+MAX_TEAM_LIST_LIMIT = 20
+DEFAULT_PROMPT_INJECTION_SIMILARITY_THRESHOLD = 0.7
+LENGTH_OF_LITELLM_GENERATED_KEY = 16
+SECRET_MANAGER_REFRESH_INTERVAL = 86400
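The first commit in the squash ("initial test banning hardcoded numbers in repo") suggests this file is backed by a lint-style test. A hedged sketch of what such a test can look like using the ast module (illustrative only; the actual test litellm added is not shown in this diff):

import ast
from pathlib import Path

ALLOWED_NUMBERS = {0, 1, 2}  # trivial values that don't need names

def find_magic_numbers(source: str, filename: str) -> list:
    # Walk the AST and flag numeric literals; bool is excluded because
    # it subclasses int.
    violations = []
    for node in ast.walk(ast.parse(source, filename=filename)):
        if (
            isinstance(node, ast.Constant)
            and isinstance(node.value, (int, float))
            and not isinstance(node.value, bool)
            and node.value not in ALLOWED_NUMBERS
        ):
            violations.append(f"{filename}:{node.lineno}: hardcoded {node.value!r}")
    return violations

def test_no_magic_numbers():
    for path in Path("litellm").rglob("*.py"):
        if path.name == "constants.py":
            continue  # the one module where raw numbers may live
        assert find_magic_numbers(path.read_text(), str(path)) == []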

litellm/cost_calculator.py (+6 -4)

@@ -9,6 +9,10 @@
 import litellm
 import litellm._logging
 from litellm import verbose_logger
+from litellm.constants import (
+    DEFAULT_MAX_LRU_CACHE_SIZE,
+    DEFAULT_REPLICATE_GPU_PRICE_PER_SECOND,
+)
 from litellm.litellm_core_utils.llm_cost_calc.tool_call_cost_tracking import (
     StandardBuiltInToolCostTracking,
 )
@@ -355,9 +359,7 @@ def cost_per_token(  # noqa: PLR0915
 def get_replicate_completion_pricing(completion_response: dict, total_time=0.0):
     # see https://replicate.com/pricing
     # for all litellm currently supported LLMs, almost all requests go to a100_80gb
-    a100_80gb_price_per_second_public = (
-        0.001400  # assume all calls sent to A100 80GB for now
-    )
+    a100_80gb_price_per_second_public = DEFAULT_REPLICATE_GPU_PRICE_PER_SECOND  # assume all calls sent to A100 80GB for now
     if total_time == 0.0:  # total time is in ms
         start_time = completion_response.get("created", time.time())
         end_time = getattr(completion_response, "ended", time.time())
@@ -450,7 +452,7 @@ def _select_model_name_for_cost_calc(
     return return_model
 
 
-@lru_cache(maxsize=16)
+@lru_cache(maxsize=DEFAULT_MAX_LRU_CACHE_SIZE)
 def _model_contains_known_llm_provider(model: str) -> bool:
     """
     Check if the model contains a known llm provider
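The lru_cache change is behavior-preserving; only the maxsize literal gets a name. For context, a self-contained sketch of how the memoization works (the provider list below is illustrative, not litellm's actual set):

from functools import lru_cache

DEFAULT_MAX_LRU_CACHE_SIZE = 16

@lru_cache(maxsize=DEFAULT_MAX_LRU_CACHE_SIZE)
def model_contains_known_llm_provider(model: str) -> bool:
    # Memoized so repeated cost lookups for the same model string
    # skip the scan; only the 16 most recent results are kept.
    known_prefixes = ("openai/", "anthropic/", "bedrock/")
    return model.startswith(known_prefixes)

model_contains_known_llm_provider("openai/gpt-4o")  # miss: computed
model_contains_known_llm_provider("openai/gpt-4o")  # hit: served from cache
print(model_contains_known_llm_provider.cache_info())  # hits=1, misses=1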

litellm/integrations/SlackAlerting/slack_alerting.py (+4 -3)

@@ -16,6 +16,7 @@
 import litellm.types
 from litellm._logging import verbose_logger, verbose_proxy_logger
 from litellm.caching.caching import DualCache
+from litellm.constants import HOURS_IN_A_DAY
 from litellm.integrations.custom_batch_logger import CustomBatchLogger
 from litellm.litellm_core_utils.duration_parser import duration_in_seconds
 from litellm.litellm_core_utils.exception_mapping_utils import (
@@ -649,10 +650,10 @@ async def budget_alerts(  # noqa: PLR0915
                 event_message += (
                     f"Budget Crossed\n Total Budget:`{user_info.max_budget}`"
                 )
-            elif percent_left <= 0.05:
+            elif percent_left <= SLACK_ALERTING_THRESHOLD_5_PERCENT:
                 event = "threshold_crossed"
                 event_message += "5% Threshold Crossed "
-            elif percent_left <= 0.15:
+            elif percent_left <= SLACK_ALERTING_THRESHOLD_15_PERCENT:
                 event = "threshold_crossed"
                 event_message += "15% Threshold Crossed"
             elif user_info.soft_budget is not None:
@@ -1718,7 +1719,7 @@ async def send_monthly_spend_report(self):
                 await self.internal_usage_cache.async_set_cache(
                     key=_event_cache_key,
                     value="SENT",
-                    ttl=(30 * 24 * 60 * 60),  # 1 month
+                    ttl=(30 * HOURS_IN_A_DAY * 60 * 60),  # 1 month
                 )
 
             except Exception as e:
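The two replaced literals (0.05 and 0.15) become SLACK_ALERTING_THRESHOLD_5_PERCENT and SLACK_ALERTING_THRESHOLD_15_PERCENT. The alert ladder itself reduces to a small function, sketched here in simplified form (the real method also handles soft budgets and projected spend):

SLACK_ALERTING_THRESHOLD_5_PERCENT = 0.05
SLACK_ALERTING_THRESHOLD_15_PERCENT = 0.15

def budget_alert_event(percent_left: float) -> str:
    # Checked most-severe first, matching the elif chain in the diff.
    if percent_left <= 0:
        return "budget_crossed"
    if percent_left <= SLACK_ALERTING_THRESHOLD_5_PERCENT:
        return "threshold_crossed (5%)"
    if percent_left <= SLACK_ALERTING_THRESHOLD_15_PERCENT:
        return "threshold_crossed (15%)"
    return "ok"

assert budget_alert_event(0.10) == "threshold_crossed (15%)"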

litellm/integrations/datadog/datadog.py (+1 -1)

@@ -41,7 +41,7 @@
 from ..additional_logging_utils import AdditionalLoggingUtils
 
 # max number of logs DD API can accept
-DD_MAX_BATCH_SIZE = 1000
+
 
 # specify what ServiceTypes are logged as success events to DD. (We don't want to spam DD traces with large number of service types)
 DD_LOGGED_SUCCESS_SERVICE_TYPES = [
