|
9 | 9 | 0.5 # default: cooldown a deployment if 50% of its requests fail in a given minute
|
10 | 10 | )
|
11 | 11 | DEFAULT_MAX_TOKENS = 4096
|
| 12 | +DEFAULT_ALLOWED_FAILS = 3 |
12 | 13 | DEFAULT_REDIS_SYNC_INTERVAL = 1
|
13 | 14 | DEFAULT_COOLDOWN_TIME_SECONDS = 5
|
14 | 15 | DEFAULT_REPLICATE_POLLING_RETRIES = 5
|
15 | 16 | DEFAULT_REPLICATE_POLLING_DELAY_SECONDS = 1
|
16 | 17 | DEFAULT_IMAGE_TOKEN_COUNT = 250
|
17 | 18 | DEFAULT_IMAGE_WIDTH = 300
|
18 | 19 | DEFAULT_IMAGE_HEIGHT = 300
|
| 20 | +DEFAULT_MAX_TOKENS = 256 # used when providers need a default; note: overrides the 4096 value defined above |
19 | 21 | MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB = 1024 # 1MB = 1024KB
|
20 | 22 | SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD = 1000 # Minimum number of requests to consider "reasonable traffic". Used for single-deployment cooldown logic.
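The cooldown thresholds above work together: a deployment whose recent failure rate crosses the 50% mark is benched for DEFAULT_COOLDOWN_TIME_SECONDS, and SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD keeps a lone deployment from being cooled down on thin traffic. A minimal sketch of that decision follows; `FAILURE_THRESHOLD_PERCENT` stands in for the constant whose name falls outside this hunk, and `should_cooldown` is a hypothetical helper, not the router's actual implementation.

```python
# Illustrative only: a minimal sketch of how these thresholds could interact,
# not LiteLLM's actual router logic.
FAILURE_THRESHOLD_PERCENT = 0.5  # placeholder name for the 0.5 constant above
DEFAULT_COOLDOWN_TIME_SECONDS = 5
SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD = 1000


def should_cooldown(failed: int, total: int, num_deployments: int) -> bool:
    """Return True if the deployment should be benched for DEFAULT_COOLDOWN_TIME_SECONDS."""
    if total == 0:
        return False
    # A lone deployment is only cooled down once it has seen "reasonable traffic";
    # otherwise a handful of failures would take the whole model group offline.
    if num_deployments == 1 and total < SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD:
        return False
    return (failed / total) >= FAILURE_THRESHOLD_PERCENT


# e.g. 60 failures out of 100 requests in the last minute, across 3 deployments
print(should_cooldown(failed=60, total=100, num_deployments=3))  # True
```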
|
21 | 23 | REDIS_UPDATE_BUFFER_KEY = "litellm_spend_update_buffer"
|
22 | 24 | REDIS_DAILY_SPEND_UPDATE_BUFFER_KEY = "litellm_daily_spend_update_buffer"
|
23 | 25 | MAX_REDIS_BUFFER_DEQUEUE_COUNT = 100
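A hedged sketch of the buffered spend-update pattern these keys suggest: request handlers push JSON entries under REDIS_UPDATE_BUFFER_KEY, and a background writer drains at most MAX_REDIS_BUFFER_DEQUEUE_COUNT entries per pass so each database batch stays bounded. The helper names are hypothetical; only the key name and dequeue count come from the constants above.

```python
# Hypothetical helpers showing the buffered spend-update pattern; not LiteLLM's actual code.
import json

import redis

REDIS_UPDATE_BUFFER_KEY = "litellm_spend_update_buffer"
MAX_REDIS_BUFFER_DEQUEUE_COUNT = 100

r = redis.Redis(host="localhost", port=6379)


def buffer_spend_update(user_id: str, spend: float) -> None:
    # Request handlers append entries instead of hitting the database per request.
    r.rpush(REDIS_UPDATE_BUFFER_KEY, json.dumps({"user_id": user_id, "spend": spend}))


def drain_spend_updates() -> list:
    # A background writer pops at most MAX_REDIS_BUFFER_DEQUEUE_COUNT entries per
    # pass, keeping each database write batch bounded.
    raw = r.lpop(REDIS_UPDATE_BUFFER_KEY, count=MAX_REDIS_BUFFER_DEQUEUE_COUNT) or []
    return [json.loads(item) for item in raw]
```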
|
| 26 | +MINIMUM_PROMPT_CACHE_TOKEN_COUNT = ( |
| 27 | + 1024 # minimum prompt length, in tokens, that Anthropic requires before a prompt can be cached |
| 28 | +) |
| 29 | +DEFAULT_TRIM_RATIO = 0.75 # default target ratio of the model's max tokens that a prompt is trimmed down to fit within |
| 30 | +HOURS_IN_A_DAY = 24 |
| 31 | +DAYS_IN_A_WEEK = 7 |
| 32 | +DAYS_IN_A_MONTH = 28 |
| 33 | +DAYS_IN_A_YEAR = 365 |
| 34 | +REPLICATE_MODEL_NAME_WITH_ID_LENGTH = 64 |
| 35 | +#### TOKEN COUNTING #### |
| 36 | +FUNCTION_DEFINITION_TOKEN_COUNT = 9 |
| 37 | +SYSTEM_MESSAGE_TOKEN_COUNT = 4 |
| 38 | +TOOL_CHOICE_OBJECT_TOKEN_COUNT = 4 |
| 39 | +DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT = 10 |
| 40 | +DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT = 20 |
| 41 | +MAX_SHORT_SIDE_FOR_IMAGE_HIGH_RES = 768 |
| 42 | +MAX_LONG_SIDE_FOR_IMAGE_HIGH_RES = 2000 |
| 43 | +MAX_TILE_WIDTH = 512 |
| 44 | +MAX_TILE_HEIGHT = 512 |
| 45 | +OPENAI_FILE_SEARCH_COST_PER_1K_CALLS = 2.5 / 1000 # $2.50 per 1K calls, stored as a per-call cost |
| 46 | +MIN_NON_ZERO_TEMPERATURE = 0.0001 |
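The image constants describe an OpenAI-style "high detail" token count: the image is scaled until its long side fits MAX_LONG_SIDE_FOR_IMAGE_HIGH_RES and its short side fits MAX_SHORT_SIDE_FOR_IMAGE_HIGH_RES, then billed per MAX_TILE_WIDTH x MAX_TILE_HEIGHT tile. A rough sketch follows, assuming OpenAI's commonly documented per-tile costs (85 base + 170 per tile), which are not defined in this file.

```python
# Rough sketch of high-detail image token counting; the per-tile costs below are
# assumptions taken from OpenAI's public docs, not constants from this file.
import math

MAX_SHORT_SIDE_FOR_IMAGE_HIGH_RES = 768
MAX_LONG_SIDE_FOR_IMAGE_HIGH_RES = 2000
MAX_TILE_WIDTH = 512
MAX_TILE_HEIGHT = 512
ASSUMED_BASE_TOKENS = 85
ASSUMED_TOKENS_PER_TILE = 170


def high_res_image_tokens(width: int, height: int) -> int:
    # 1. Shrink so the longest side fits within the high-res long-side limit.
    scale = min(1.0, MAX_LONG_SIDE_FOR_IMAGE_HIGH_RES / max(width, height))
    width, height = int(width * scale), int(height * scale)
    # 2. Shrink again so the shortest side fits within the short-side limit.
    scale = min(1.0, MAX_SHORT_SIDE_FOR_IMAGE_HIGH_RES / min(width, height))
    width, height = int(width * scale), int(height * scale)
    # 3. Bill a fixed base cost plus a per-tile cost for each 512x512 tile.
    tiles = math.ceil(width / MAX_TILE_WIDTH) * math.ceil(height / MAX_TILE_HEIGHT)
    return ASSUMED_BASE_TOKENS + ASSUMED_TOKENS_PER_TILE * tiles


print(high_res_image_tokens(1920, 1080))  # 6 tiles -> 1105 tokens under these assumptions
```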
24 | 47 | #### RELIABILITY ####
|
25 | 48 | REPEATED_STREAMING_CHUNK_LIMIT = 100 # catch if a model starts looping the same chunk while streaming; the default is high to prevent false positives.
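A minimal sketch of the loop guard this comment describes, assuming the check is on consecutive identical chunks; this is illustrative, not LiteLLM's actual streaming wrapper.

```python
# Illustrative loop guard: abort once the same chunk repeats too many times in a row.
from typing import Iterable, Iterator

REPEATED_STREAMING_CHUNK_LIMIT = 100


def guard_against_chunk_loops(chunks: Iterable[str]) -> Iterator[str]:
    last_chunk, repeats = None, 0
    for chunk in chunks:
        repeats = repeats + 1 if chunk == last_chunk else 1
        last_chunk = chunk
        if repeats >= REPEATED_STREAMING_CHUNK_LIMIT:
            raise RuntimeError("model appears to be looping the same streaming chunk")
        yield chunk
```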
|
| 49 | +DEFAULT_MAX_LRU_CACHE_SIZE = 16 |
| 50 | +INITIAL_RETRY_DELAY = 0.5 |
| 51 | +MAX_RETRY_DELAY = 8.0 |
| 52 | +JITTER = 0.75 |
| 53 | +DEFAULT_IN_MEMORY_TTL = 5 # default time to live for the in-memory cache |
| 54 | +DEFAULT_POLLING_INTERVAL = 0.03 # default polling interval for the scheduler |
| 55 | +AZURE_OPERATION_POLLING_TIMEOUT = 120 |
| 56 | +REDIS_SOCKET_TIMEOUT = 0.1 |
| 57 | +REDIS_CONNECTION_POOL_TIMEOUT = 5 |
| 58 | +NON_LLM_CONNECTION_TIMEOUT = 15 # timeout for adjacent services (e.g. jwt auth) |
| 59 | +MAX_EXCEPTION_MESSAGE_LENGTH = 2000 |
| 60 | +BEDROCK_MAX_POLICY_SIZE = 75 |
| 61 | +REPLICATE_POLLING_DELAY_SECONDS = 0.5 |
| 62 | +DEFAULT_ANTHROPIC_CHAT_MAX_TOKENS = 4096 |
| 63 | +TOGETHER_AI_4_B = 4 # Together AI pricing tiers below are model sizes in billions of parameters |
| 64 | +TOGETHER_AI_8_B = 8 |
| 65 | +TOGETHER_AI_21_B = 21 |
| 66 | +TOGETHER_AI_41_B = 41 |
| 67 | +TOGETHER_AI_80_B = 80 |
| 68 | +TOGETHER_AI_110_B = 110 |
| 69 | +TOGETHER_AI_EMBEDDING_150_M = 150 # embedding model tiers, in millions of parameters |
| 70 | +TOGETHER_AI_EMBEDDING_350_M = 350 |
| 71 | +QDRANT_SCALAR_QUANTILE = 0.99 |
| 72 | +QDRANT_VECTOR_SIZE = 1536 |
| 73 | +CACHED_STREAMING_CHUNK_DELAY = 0.02 |
| 74 | +MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB = 512 # note: overrides the 1024 KB value defined above |
| 75 | +DEFAULT_MAX_TOKENS_FOR_TRITON = 2000 |
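INITIAL_RETRY_DELAY, MAX_RETRY_DELAY, and JITTER read like standard capped exponential backoff parameters. A sketch of how such constants are typically combined; the exact jitter formula applied to them here is an assumption.

```python
# Sketch of capped exponential backoff with multiplicative jitter; illustrative only.
import random

INITIAL_RETRY_DELAY = 0.5
MAX_RETRY_DELAY = 8.0
JITTER = 0.75


def retry_delay(attempt: int) -> float:
    """Delay before retry number `attempt` (0-indexed), in seconds."""
    base = min(INITIAL_RETRY_DELAY * (2 ** attempt), MAX_RETRY_DELAY)
    # Scale by a random factor in [JITTER, 1.0] so concurrent clients desynchronize.
    return base * random.uniform(JITTER, 1.0)


for attempt in range(5):
    print(f"attempt {attempt}: sleeping up to {retry_delay(attempt):.2f}s")
```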
26 | 76 | #### Networking settings ####
|
27 | 77 | request_timeout: float = 6000 # time in seconds
|
28 | 78 | STREAM_SSE_DONE_STRING: str = "[DONE]"
|
| 79 | +### SPEND TRACKING ### |
| 80 | +DEFAULT_REPLICATE_GPU_PRICE_PER_SECOND = 0.001400 # price per second for an A100 80GB |
| 81 | +FIREWORKS_AI_56_B_MOE = 56 # Fireworks AI pricing tiers below, in billions of parameters |
| 82 | +FIREWORKS_AI_176_B_MOE = 176 |
| 83 | +FIREWORKS_AI_16_B = 16 |
| 84 | +FIREWORKS_AI_80_B = 80 |
29 | 85 |
|
30 | 86 | LITELLM_CHAT_PROVIDERS = [
|
31 | 87 | "openai",
|
|
426 | 482 | MAX_SPENDLOG_ROWS_TO_QUERY = (
|
427 | 483 | 1_000_000 # if spendLogs has more than 1M rows, do not query the DB
|
428 | 484 | )
|
| 485 | +DEFAULT_SOFT_BUDGET = ( |
| 486 | + 50.0 # by default all litellm proxy keys have a soft budget of 50.0 |
| 487 | +) |
429 | 488 | # makes it clear this is a rate limit error for a litellm virtual key
|
430 | 489 | RATE_LIMIT_ERROR_MESSAGE_FOR_VIRTUAL_KEY = "LiteLLM Virtual Key user_api_key_hash"
|
431 | 490 |
|
|
451 | 510 | ########################### DB CRON JOB NAMES ###########################
|
452 | 511 | DB_SPEND_UPDATE_JOB_NAME = "db_spend_update_job"
|
453 | 512 | DEFAULT_CRON_JOB_LOCK_TTL_SECONDS = 60 # 1 minute
|
| 513 | +PROXY_BUDGET_RESCHEDULER_MIN_TIME = 597 |
| 514 | +PROXY_BUDGET_RESCHEDULER_MAX_TIME = 605 |
| 515 | +PROXY_BATCH_WRITE_AT = 10 # in seconds |
| 516 | +DEFAULT_HEALTH_CHECK_INTERVAL = 300 # 5 minutes |
| 517 | +PROMETHEUS_FALLBACK_STATS_SEND_TIME_HOURS = 9 |
| 518 | +DEFAULT_MODEL_CREATED_AT_TIME = 1677610602 # Unix timestamp returned on the `/models` endpoint |
| 519 | +DEFAULT_SLACK_ALERTING_THRESHOLD = 300 |
| 520 | +MAX_TEAM_LIST_LIMIT = 20 |
| 521 | +DEFAULT_PROMPT_INJECTION_SIMILARITY_THRESHOLD = 0.7 |
| 522 | +LENGTH_OF_LITELLM_GENERATED_KEY = 16 |
| 523 | +SECRET_MANAGER_REFRESH_INTERVAL = 86400 # 24 hours, in seconds |
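PROXY_BUDGET_RESCHEDULER_MIN_TIME / MAX_TIME being a narrow range (597-605) suggests each proxy replica sleeps a random interval inside that window before resetting budgets, so replicas don't all hit the database at the same instant. A hedged sketch, assuming the values are seconds and a caller-supplied `reset_budgets` coroutine (not a LiteLLM API):

```python
# Hedged sketch of jittered budget rescheduling across proxy replicas.
import asyncio
import random

PROXY_BUDGET_RESCHEDULER_MIN_TIME = 597
PROXY_BUDGET_RESCHEDULER_MAX_TIME = 605


async def budget_rescheduler_loop(reset_budgets) -> None:
    # `reset_budgets` is a caller-supplied coroutine function.
    while True:
        await asyncio.sleep(
            random.randint(
                PROXY_BUDGET_RESCHEDULER_MIN_TIME, PROXY_BUDGET_RESCHEDULER_MAX_TIME
            )
        )
        await reset_budgets()
```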