
Commit 72e961a

LiteLLM Minor Fixes and Improvements (08/06/2024) (#5567)
* fix(utils.py): return citations for perplexity streaming (Fixes #5535)

* fix(anthropic/chat.py): support fallbacks for anthropic streaming (#5542)
  * fix(anthropic/chat.py): support fallbacks for anthropic streaming (Fixes #5512)
  * fix(anthropic/chat.py): use module level http client if none given (prevents early client closure)
  * fix: fix linting errors
  * fix(http_handler.py): fix raise_for_status error handling
  * test: retry flaky test
  * fix otel type
  * fix(bedrock/embed): fix error raising
  * test(test_openai_batches_and_files.py): skip azure batches test (for now), quota exceeded
  * fix(test_router.py): skip azure batch route test (for now), hit batch quota limits
  * Co-authored-by: Ishaan Jaff <[email protected]>

* All `model_group_alias` should show up in `/models`, `/model/info`, `/model_group/info` (#5539)
  * fix(router.py): support returning model_alias model names in `/v1/models`
  * fix(proxy_server.py): support returning model aliases on `/model/info`
  * feat(router.py): support returning model group alias for `/model_group/info`
  * fix(proxy_server.py): fix linting errors
  * fix(proxy_server.py): fix linting errors

* build(model_prices_and_context_window.json): add amazon titan text premier pricing information (Closes #5560)

* feat(litellm_logging.py): log standard logging response object for pass through endpoints. Allows bedrock /invoke agent calls to be correctly logged to langfuse + s3

* fix(success_handler.py): fix linting error

* fix(success_handler.py): fix linting errors

* fix(team_endpoints.py): allow admin to update team member budgets

Co-authored-by: Ishaan Jaff <[email protected]>
1 parent e4dcd6f commit 72e961a

25 files changed (+508, -98 lines)

.devcontainer/devcontainer.json

+2-1
@@ -22,7 +22,8 @@
         "ms-python.python",
         "ms-python.vscode-pylance",
         "GitHub.copilot",
-        "GitHub.copilot-chat"
+        "GitHub.copilot-chat",
+        "ms-python.autopep8"
       ]
     }
   },

.pre-commit-config.yaml

+6-6
@@ -1,12 +1,12 @@
 repos:
 - repo: local
   hooks:
-    # - id: mypy
-    #   name: mypy
-    #   entry: python3 -m mypy --ignore-missing-imports
-    #   language: system
-    #   types: [python]
-    #   files: ^litellm/
+    - id: mypy
+      name: mypy
+      entry: python3 -m mypy --ignore-missing-imports
+      language: system
+      types: [python]
+      files: ^litellm/
     - id: isort
       name: isort
       entry: isort

litellm/integrations/langfuse.py

+8
@@ -208,6 +208,14 @@ def log_event(
         ):
             input = prompt
             output = response_obj["text"]
+        elif (
+            kwargs.get("call_type") is not None
+            and kwargs.get("call_type") == "pass_through_endpoint"
+            and response_obj is not None
+            and isinstance(response_obj, dict)
+        ):
+            input = prompt
+            output = response_obj.get("response", "")
         print_verbose(f"OUTPUT IN LANGFUSE: {output}; original: {response_obj}")
         trace_id = None
         generation_id = None
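
The new branch chooses the Langfuse output for pass-through endpoint calls, where `response_obj` is a plain dict rather than a model response object. A minimal standalone sketch of that selection logic (the helper below is illustrative only and not part of the library):

```python
from typing import Any, Optional


def pick_langfuse_output(kwargs: dict, response_obj: Any) -> Optional[Any]:
    """Illustrative restatement of the branch added above: pass-through
    endpoint responses are plain dicts, so the logged output comes from
    their "response" key instead of response_obj["text"]."""
    if kwargs.get("call_type") == "pass_through_endpoint" and isinstance(response_obj, dict):
        return response_obj.get("response", "")
    if isinstance(response_obj, dict) and "text" in response_obj:
        return response_obj["text"]
    return None


# Example: a Bedrock /invoke-agent style pass-through payload
print(pick_langfuse_output(
    kwargs={"call_type": "pass_through_endpoint"},
    response_obj={"response": "agent answer"},
))  # -> "agent answer"
```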

litellm/integrations/s3.py

+1-7
@@ -101,12 +101,6 @@ def log_event(self, kwargs, response_obj, start_time, end_time, print_verbose):
             metadata = (
                 litellm_params.get("metadata", {}) or {}
             )  # if litellm_params['metadata'] == None
-            messages = kwargs.get("messages")
-            optional_params = kwargs.get("optional_params", {})
-            call_type = kwargs.get("call_type", "litellm.completion")
-            cache_hit = kwargs.get("cache_hit", False)
-            usage = response_obj["usage"]
-            id = response_obj.get("id", str(uuid.uuid4()))

             # Clean Metadata before logging - never log raw metadata
             # the raw metadata can contain circular references which leads to infinite recursion
@@ -171,5 +165,5 @@ def log_event(self, kwargs, response_obj, start_time, end_time, print_verbose):
             print_verbose(f"s3 Layer Logging - final response object: {response_obj}")
             return response
         except Exception as e:
-            verbose_logger.debug(f"s3 Layer Error - {str(e)}\n{traceback.format_exc()}")
+            verbose_logger.exception(f"s3 Layer Error - {str(e)}")
             pass
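
The logging change relies on the standard-library behavior that `logger.exception(...)`, when called inside an `except` block, appends the active traceback automatically (and logs at ERROR level), so the manual `traceback.format_exc()` interpolation is no longer needed. A tiny sketch with a plain stdlib logger standing in for LiteLLM's `verbose_logger`:

```python
import logging

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("s3_example")

try:
    raise ValueError("boom")
except Exception as e:
    # logger.exception records the current traceback for us, replacing the
    # old f"{e}\n{traceback.format_exc()}" pattern removed in the diff above.
    logger.exception(f"s3 Layer Error - {str(e)}")
```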

litellm/litellm_core_utils/litellm_logging.py

+17-1
@@ -41,6 +41,7 @@
     StandardLoggingMetadata,
     StandardLoggingModelInformation,
     StandardLoggingPayload,
+    StandardPassThroughResponseObject,
     TextCompletionResponse,
     TranscriptionResponse,
 )
@@ -534,7 +535,9 @@ def _response_cost_calculator(
         """
         ## RESPONSE COST ##
         custom_pricing = use_custom_pricing_for_model(
-            litellm_params=self.litellm_params
+            litellm_params=(
+                self.litellm_params if hasattr(self, "litellm_params") else None
+            )
         )

         if cache_hit is None:
@@ -611,6 +614,17 @@ def _success_handler_helper_fn(
                 ] = result._hidden_params
                 ## STANDARDIZED LOGGING PAYLOAD

+                self.model_call_details["standard_logging_object"] = (
+                    get_standard_logging_object_payload(
+                        kwargs=self.model_call_details,
+                        init_response_obj=result,
+                        start_time=start_time,
+                        end_time=end_time,
+                        logging_obj=self,
+                    )
+                )
+            elif isinstance(result, dict):  # pass-through endpoints
+                ## STANDARDIZED LOGGING PAYLOAD
                 self.model_call_details["standard_logging_object"] = (
                     get_standard_logging_object_payload(
                         kwargs=self.model_call_details,
@@ -2271,6 +2285,8 @@ def get_standard_logging_object_payload(
     elif isinstance(init_response_obj, BaseModel):
         response_obj = init_response_obj.model_dump()
         hidden_params = getattr(init_response_obj, "_hidden_params", None)
+    elif isinstance(init_response_obj, dict):
+        response_obj = init_response_obj
     else:
         response_obj = {}
     # standardize this function to be used across, s3, dynamoDB, langfuse logging
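
Pass-through endpoints hand the logging pipeline a plain dict instead of a Pydantic response object, so the payload builder now needs a dict branch. A small standalone sketch of that dispatch (function and class names here are hypothetical stand-ins, not LiteLLM internals):

```python
from typing import Any

from pydantic import BaseModel


def normalize_response_obj(init_response_obj: Any) -> dict:
    """Illustrative version of the dispatch added above: Pydantic model
    responses are dumped to a dict, pass-through responses are already
    dicts and are used as-is, anything else falls back to an empty dict."""
    if isinstance(init_response_obj, BaseModel):
        return init_response_obj.model_dump()
    if isinstance(init_response_obj, dict):  # pass-through endpoints
        return init_response_obj
    return {}


class FakeModelResponse(BaseModel):
    id: str = "chatcmpl-123"


print(normalize_response_obj(FakeModelResponse()))           # {'id': 'chatcmpl-123'}
print(normalize_response_obj({"response": "agent answer"}))  # {'response': 'agent answer'}
print(normalize_response_obj(None))                          # {}
```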

litellm/llms/anthropic/chat.py

+34-35
@@ -674,7 +674,7 @@ async def make_call(
     timeout: Optional[Union[float, httpx.Timeout]],
 ):
     if client is None:
-        client = _get_async_httpx_client()  # Create a new client if none provided
+        client = litellm.module_level_aclient

     try:
         response = await client.post(
@@ -690,11 +690,6 @@ async def make_call(
                 raise e
         raise AnthropicError(status_code=500, message=str(e))

-    if response.status_code != 200:
-        raise AnthropicError(
-            status_code=response.status_code, message=await response.aread()
-        )
-
     completion_stream = ModelResponseIterator(
         streaming_response=response.aiter_lines(), sync_stream=False
     )
@@ -721,7 +716,7 @@ def make_sync_call(
     timeout: Optional[Union[float, httpx.Timeout]],
 ):
     if client is None:
-        client = HTTPHandler()  # Create a new client if none provided
+        client = litellm.module_level_client  # re-use a module level client

     try:
         response = client.post(
@@ -869,6 +864,7 @@ async def acompletion_stream_function(
         model_response: ModelResponse,
         print_verbose: Callable,
         timeout: Union[float, httpx.Timeout],
+        client: Optional[AsyncHTTPHandler],
         encoding,
         api_key,
         logging_obj,
@@ -882,19 +878,18 @@ async def acompletion_stream_function(
     ):
         data["stream"] = True

+        completion_stream = await make_call(
+            client=client,
+            api_base=api_base,
+            headers=headers,
+            data=json.dumps(data),
+            model=model,
+            messages=messages,
+            logging_obj=logging_obj,
+            timeout=timeout,
+        )
         streamwrapper = CustomStreamWrapper(
-            completion_stream=None,
-            make_call=partial(
-                make_call,
-                client=None,
-                api_base=api_base,
-                headers=headers,
-                data=json.dumps(data),
-                model=model,
-                messages=messages,
-                logging_obj=logging_obj,
-                timeout=timeout,
-            ),
+            completion_stream=completion_stream,
             model=model,
             custom_llm_provider="anthropic",
             logging_obj=logging_obj,
@@ -1080,6 +1075,11 @@ def completion(
                 logger_fn=logger_fn,
                 headers=headers,
                 timeout=timeout,
+                client=(
+                    client
+                    if client is not None and isinstance(client, AsyncHTTPHandler)
+                    else None
+                ),
             )
         else:
             return self.acompletion_function(
@@ -1105,33 +1105,32 @@ def completion(
             )
         else:
             ## COMPLETION CALL
-            if client is None or not isinstance(client, HTTPHandler):
-                client = HTTPHandler(timeout=timeout)  # type: ignore
-            else:
-                client = client
             if (
                 stream is True
             ):  # if function call - fake the streaming (need complete blocks for output parsing in openai format)
                 data["stream"] = stream
+                completion_stream = make_sync_call(
+                    client=client,
+                    api_base=api_base,
+                    headers=headers,  # type: ignore
+                    data=json.dumps(data),
+                    model=model,
+                    messages=messages,
+                    logging_obj=logging_obj,
+                    timeout=timeout,
+                )
                 return CustomStreamWrapper(
-                    completion_stream=None,
-                    make_call=partial(
-                        make_sync_call,
-                        client=None,
-                        api_base=api_base,
-                        headers=headers,  # type: ignore
-                        data=json.dumps(data),
-                        model=model,
-                        messages=messages,
-                        logging_obj=logging_obj,
-                        timeout=timeout,
-                    ),
+                    completion_stream=completion_stream,
                     model=model,
                     custom_llm_provider="anthropic",
                     logging_obj=logging_obj,
                 )

             else:
+                if client is None or not isinstance(client, HTTPHandler):
+                    client = HTTPHandler(timeout=timeout)  # type: ignore
+                else:
+                    client = client
                 response = client.post(
                     api_base, headers=headers, data=json.dumps(data), timeout=timeout
                 )
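
Two things change here for Anthropic streaming: the handlers reuse module-level HTTP clients instead of creating (and potentially prematurely closing) a client per call, and the stream is now opened eagerly via `make_call` / `make_sync_call` before the `CustomStreamWrapper` is returned. Opening the stream eagerly means a provider error surfaces while the caller (for example, router fallback logic) is still on the stack, instead of only when the first chunk is iterated. A generic sketch of that eager-vs-lazy difference, using hypothetical names rather than LiteLLM internals:

```python
def open_stream():
    # Stand-in for make_call(): pretend the provider returns HTTP 529.
    raise RuntimeError("overloaded (529)")


def lazy_wrapper(make_call):
    # Old shape: the request happens only when iteration starts,
    # so the error escapes after completion() has already returned.
    def gen():
        yield from make_call()
    return gen()


def eager_wrapper(make_call):
    # New shape: the request is made up front, so the error is raised
    # here, where fallback / retry logic can still catch it.
    return iter(make_call())


try:
    eager_wrapper(open_stream)
except RuntimeError as e:
    print(f"caught at call time, fallback can run: {e}")

gen = lazy_wrapper(open_stream)  # no error yet
try:
    next(gen)
except RuntimeError as e:
    print(f"caught only at first chunk: {e}")
```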

litellm/llms/bedrock/embed/embedding.py

+1-1
@@ -110,7 +110,7 @@ def _make_sync_call(
         response.raise_for_status()
     except httpx.HTTPStatusError as err:
         error_code = err.response.status_code
-        raise BedrockError(status_code=error_code, message=response.text)
+        raise BedrockError(status_code=error_code, message=err.response.text)
     except httpx.TimeoutException:
         raise BedrockError(status_code=408, message="Timeout error occurred.")

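
The fix reads the error body from `err.response`, the response object attached to the raised `httpx.HTTPStatusError`, rather than from a separate `response` variable. A minimal httpx sketch of the same pattern (the endpoint and error type are illustrative, not Bedrock specifics):

```python
import httpx


def embed_call(url: str) -> dict:
    try:
        response = httpx.post(url, json={"inputText": "hello"})
        # raise_for_status raises httpx.HTTPStatusError for 4xx/5xx responses
        response.raise_for_status()
        return response.json()
    except httpx.HTTPStatusError as err:
        # Status code and body come from the response bound to the error itself.
        raise RuntimeError(
            f"provider error {err.response.status_code}: {err.response.text}"
        ) from err
```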

litellm/llms/custom_httpx/http_handler.py

+13-18
@@ -37,16 +37,10 @@ def create_client(

     # SSL certificates (a.k.a CA bundle) used to verify the identity of requested hosts.
     # /path/to/certificate.pem
-    ssl_verify = os.getenv(
-        "SSL_VERIFY",
-        litellm.ssl_verify
-    )
+    ssl_verify = os.getenv("SSL_VERIFY", litellm.ssl_verify)
     # An SSL certificate used by the requested host to authenticate the client.
     # /path/to/client.pem
-    cert = os.getenv(
-        "SSL_CERTIFICATE",
-        litellm.ssl_certificate
-    )
+    cert = os.getenv("SSL_CERTIFICATE", litellm.ssl_certificate)

     if timeout is None:
         timeout = _DEFAULT_TIMEOUT
@@ -277,16 +271,10 @@ def __init__(

         # SSL certificates (a.k.a CA bundle) used to verify the identity of requested hosts.
         # /path/to/certificate.pem
-        ssl_verify = os.getenv(
-            "SSL_VERIFY",
-            litellm.ssl_verify
-        )
+        ssl_verify = os.getenv("SSL_VERIFY", litellm.ssl_verify)
         # An SSL certificate used by the requested host to authenticate the client.
         # /path/to/client.pem
-        cert = os.getenv(
-            "SSL_CERTIFICATE",
-            litellm.ssl_certificate
-        )
+        cert = os.getenv("SSL_CERTIFICATE", litellm.ssl_certificate)

         if client is None:
             # Create a client with a connection pool
@@ -334,13 +322,21 @@ def post(
                 "POST", url, data=data, json=json, params=params, headers=headers  # type: ignore
             )
             response = self.client.send(req, stream=stream)
+            response.raise_for_status()
             return response
         except httpx.TimeoutException:
             raise litellm.Timeout(
                 message=f"Connection timed out after {timeout} seconds.",
                 model="default-model-name",
                 llm_provider="litellm-httpx-handler",
             )
+        except httpx.HTTPStatusError as e:
+            setattr(e, "status_code", e.response.status_code)
+            if stream is True:
+                setattr(e, "message", e.response.read())
+            else:
+                setattr(e, "message", e.response.text)
+            raise e
         except Exception as e:
             raise e

@@ -375,7 +371,6 @@ def put(
         except Exception as e:
             raise e

-
     def __del__(self) -> None:
         try:
             self.close()
@@ -437,4 +432,4 @@ def _get_httpx_client(params: Optional[dict] = None) -> HTTPHandler:
     _new_client = HTTPHandler(timeout=httpx.Timeout(timeout=600.0, connect=5.0))

     litellm.in_memory_llm_clients_cache[_cache_key_name] = _new_client
-    return _new_client
+    return _new_client
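
With this change, the sync `post()` no longer hands a failed response back to the caller: `raise_for_status()` converts non-2xx responses into an `httpx.HTTPStatusError`, annotated with `status_code` and `message` attributes (the body is read for streaming responses, `.text` otherwise). A sketch of the same error-shaping pattern in plain httpx, assuming an example URL:

```python
import httpx


def post_with_status_error(url: str, payload: dict) -> httpx.Response:
    """Sketch of the behavior added above: raise on non-2xx responses and
    attach status_code / message attributes to the httpx error."""
    try:
        response = httpx.post(url, json=payload)
        response.raise_for_status()
        return response
    except httpx.HTTPStatusError as e:
        setattr(e, "status_code", e.response.status_code)
        setattr(e, "message", e.response.text)
        raise e


# Example usage against a throwaway test endpoint (assumed reachable):
try:
    post_with_status_error("https://httpbin.org/status/503", {})
except httpx.HTTPStatusError as e:
    print(e.status_code)  # 503, available to callers that map errors to provider exceptions
```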

litellm/main.py

+11
@@ -534,6 +534,15 @@ def mock_completion(
             model=model,  # type: ignore
             request=httpx.Request(method="POST", url="https://api.openai.com/v1/"),
         )
+    elif isinstance(mock_response, str) and mock_response.startswith(
+        "Exception: mock_streaming_error"
+    ):
+        mock_response = litellm.MockException(
+            message="This is a mock error raised mid-stream",
+            llm_provider="anthropic",
+            model=model,
+            status_code=529,
+        )
     time_delay = kwargs.get("mock_delay", None)
     if time_delay is not None:
         time.sleep(time_delay)
@@ -561,6 +570,8 @@ def mock_completion(
                 custom_llm_provider="openai",
                 logging_obj=logging,
             )
+        if isinstance(mock_response, litellm.MockException):
+            raise mock_response
         if n is None:
             model_response.choices[0].message.content = mock_response  # type: ignore
         else:
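
This lets tests exercise a streaming failure (and the fallback path) without calling a real provider: a `mock_response` string beginning with `"Exception: mock_streaming_error"` is converted into a `litellm.MockException` with status code 529. A hedged usage sketch, assuming `mock_response` is forwarded from `completion()` into `mock_completion()` as usual:

```python
import litellm

try:
    response = litellm.completion(
        model="anthropic/claude-3-5-sonnet-20240620",
        messages=[{"role": "user", "content": "hi"}],
        mock_response="Exception: mock_streaming_error",
        stream=True,
    )
    # Consume the stream so an error raised mid-stream also surfaces here.
    for _chunk in response:
        pass
except Exception as e:
    # Expected: a litellm.MockException carrying status_code 529, useful for
    # testing retry / fallback behavior without a real Anthropic call.
    print(type(e).__name__, getattr(e, "status_code", None))
```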

litellm/model_prices_and_context_window_backup.json

+9
@@ -3408,6 +3408,15 @@
         "litellm_provider": "bedrock",
         "mode": "chat"
     },
+    "amazon.titan-text-premier-v1:0": {
+        "max_tokens": 32000,
+        "max_input_tokens": 42000,
+        "max_output_tokens": 32000,
+        "input_cost_per_token": 0.0000005,
+        "output_cost_per_token": 0.0000015,
+        "litellm_provider": "bedrock",
+        "mode": "chat"
+    },
     "amazon.titan-embed-text-v1": {
         "max_tokens": 8192,
         "max_input_tokens": 8192,

litellm/proxy/_new_secret_config.yaml

+11-2
@@ -1,7 +1,16 @@
 model_list:
-  - model_name: "*"
+  - model_name: "anthropic/claude-3-5-sonnet-20240620"
+    litellm_params:
+      model: anthropic/claude-3-5-sonnet-20240620
+      # api_base: http://0.0.0.0:9000
+  - model_name: gpt-3.5-turbo
     litellm_params:
       model: openai/*

 litellm_settings:
-  default_max_internal_user_budget: 2
+  success_callback: ["s3"]
+  s3_callback_params:
+    s3_bucket_name: litellm-logs  # AWS Bucket Name for S3
+    s3_region_name: us-west-2  # AWS Region Name for S3
+    s3_aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID  # use os.environ/<variable name> to pass environment variables. This is the AWS Access Key ID for S3
+    s3_aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY  # AWS Secret Access Key for S3
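
The updated example config routes successful calls through the S3 success callback. A rough SDK-side sketch of the same idea is below; it assumes the Python SDK reads `litellm.s3_callback_params` the same way the proxy reads `s3_callback_params`, so treat the exact attribute names as illustrative:

```python
import litellm

# Enable the S3 success callback from Python (parameter names assumed to
# mirror the proxy's s3_callback_params block shown above).
litellm.success_callback = ["s3"]
litellm.s3_callback_params = {
    "s3_bucket_name": "litellm-logs",
    "s3_region_name": "us-west-2",
    "s3_aws_access_key_id": "os.environ/AWS_ACCESS_KEY_ID",
    "s3_aws_secret_access_key": "os.environ/AWS_SECRET_ACCESS_KEY",
}
```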
