weaviate
diff --git a/docs/Examples/data_analysis.md b/docs/Examples/data_analysis.md
Lines changed: 93 additions & 412 deletions
diff --git a/docs/Examples/old_data_analysis.md b/docs/Examples/old_data_analysis.md
Lines changed: 463 additions & 0 deletions
diff --git a/docs/creating_tools.md b/docs/creating_tools.md
Lines changed: 1 addition & 1 deletion
diff --git a/docs/img/diabetes-regression-example.png b/docs/img/diabetes-regression-example.png
16.2 KB
diff --git a/elysia/objects.py b/elysia/objects.py
Lines changed: 18 additions & 5 deletions
diff --git a/elysia/preprocessing/collection.py b/elysia/preprocessing/collection.py
Lines changed: 26 additions & 13 deletions
diff --git a/elysia/util/client.py b/elysia/util/client.py
Lines changed: 26 additions & 1 deletion
diff --git a/elysia/util/collection.py b/elysia/util/collection.py
Lines changed: 5 additions & 0 deletions
diff --git a/tests/no_reqs/general/test_tools_nr.py b/tests/no_reqs/general/test_tools_nr.py
Lines changed: 66 additions & 0 deletions
@@ -123,7 +123,7 @@ Finally, tools can interact with Elysia's environment, LMs and the Weaviate clie
 ```python
 @tool
 async def some_tool(
-    tree_data, base_lm, complex_lm, tree_data, # these inputs are automatically assigned as Elysia variables
+    tree_data, base_lm, complex_lm, client_manager, # these inputs are automatically assigned as Elysia variables
     x: str, y: int # these inputs are not assigned automatically and get assigned by the decision agent
 ):
     # do something
 
@@ -373,6 +373,18 @@ def return_mapping(result, inputs: dict):
 
         class ToolClass(Tool):
             def __init__(self, **kwargs):
+                self._original_function = function
+                self._original_function_args = {
+                    arg: None
+                    for arg in function.__code__.co_varnames[
+                        : function.__code__.co_argcount
+                    ]
+                }
+                for arg in function.__annotations__:
+                    if arg in self._original_function_args:
+                        self._original_function_args[arg] = function.__annotations__[
+                            arg
+                        ]
                 super().__init__(
                     name=function.__name__,
                     description=function.__doc__ or "",
@@ -384,12 +396,14 @@ def __init__(self, **kwargs):
                     inputs={
                         input_key: {
                             "description": "",
-                            "type": input_value,
+                            "type": (
+                                "Not specified" if input_value is None else input_value
+                            ),
                             "default": defaults_mapping.get(input_key, None),
                             "required": defaults_mapping.get(input_key, None)
                             is not None,
                         }
-                        for input_key, input_value in function.__annotations__.items()
+                        for input_key, input_value in self._original_function_args.items()
                         if input_key
                         not in [
                             "tree_data",
@@ -401,7 +415,6 @@ def __init__(self, **kwargs):
                     },
                     end=end,
                 )
-                self._original_function = function
 
             async def __call__(
                 self, tree_data, inputs, base_lm, complex_lm, client_manager, **kwargs
@@ -420,7 +433,7 @@ async def __call__(
                                     "client_manager": client_manager,
                                     **kwargs,
                                 }.items()
-                                if k in function.__annotations__
+                                if k in self._original_function_args
                             },
                         )
                     ]
@@ -437,7 +450,7 @@ async def __call__(
                                 "client_manager": client_manager,
                                 **kwargs,
                             }.items()
-                            if k in function.__annotations__
+                            if k in self._original_function_args
                         },
                     ):
                         results.append(result)
 
@@ -368,7 +368,7 @@ async def preprocess_async(
     collection_name: str,
     client_manager: ClientManager | None = None,
     min_sample_size: int = 10,
-    max_sample_size: int = 20,
+    max_sample_size: int | None = None,
     num_sample_tokens: int = 30000,
     force: bool = False,
     percentage_correct_threshold: float = 0.3,
@@ -443,9 +443,19 @@ async def preprocess_async(
         agg = await collection.aggregate.over_all(total_count=True)
         len_collection: int = agg.total_count  # type: ignore
 
+        if max_sample_size is None and len_collection > 50_000:
+            max_sample_size = 20
+            logger.warning(
+                f"Collection is large (greater than 50,000 objects), causing slowdown in pre-processing. "
+                f"Reducing maximum sample size to {max_sample_size} objects. "
+                "To override this, set `max_sample_size` as an argument to preprocess."
+            )
+        elif max_sample_size is None:
+            max_sample_size = 50
+
         # Randomly sample sample_size objects for the summary
         indices = random.sample(
-            range(len_collection),
+            range(min(99_999, len_collection)),
             max(min(max_sample_size, len_collection), 1),
         )
 
@@ -455,17 +465,20 @@ async def preprocess_async(
         subset_objects: list[dict] = [obj.objects[0].properties]  # type: ignore
 
         # Get number of objects to sample to get close to num_sample_tokens
-        num_sample_objects = max(min_sample_size, num_sample_tokens // token_count_0)
-
-        for index in indices[1:num_sample_objects]:
-            obj = await collection.query.fetch_objects(limit=1, offset=index)
-            subset_objects.append(obj.objects[0].properties)  # type: ignore
+        num_sample_objects = min(
+            max(min_sample_size, num_sample_tokens // token_count_0),
+            max_sample_size,
+        )
 
         # Estimate number of tokens
         logger.debug(
-            f"Estimated token count of sample: {token_count_0*len(subset_objects)}"
+            f"Estimated token count of sample: {token_count_0*num_sample_objects}"
         )
-        logger.debug(f"Number of objects in sample: {len(subset_objects)}")
+        logger.debug(f"Number of objects in sample: {num_sample_objects}")
+
+        for index in indices[1:num_sample_objects]:
+            obj = await collection.query.fetch_objects(limit=1, offset=index)
+            subset_objects.append(obj.objects[0].properties)  # type: ignore
 
         # Summarise the collection using LLM and the subset of the data
         summary, field_descriptions = await _summarise_collection(
@@ -481,7 +494,7 @@ async def preprocess_async(
             message="Generated summary of collection",
         )
 
-        if len_collection > max_sample_size:
+        if len_collection > 10_000:  # arbitrary cutoff for estimating field statistics
             full_response = subset_objects
         else:
             weaviate_resp = await collection.query.fetch_objects(limit=len_collection)
@@ -782,7 +795,7 @@ async def _preprocess_async(
     collection_names: list[str] | str,
     client_manager: ClientManager | None = None,
     min_sample_size: int = 10,
-    max_sample_size: int = 20,
+    max_sample_size: int | None = None,
     num_sample_tokens: int = 30000,
     settings: Settings = environment_settings,
     force: bool = False,
@@ -860,8 +873,8 @@ async def _preprocess_async(
 def preprocess(
     collection_names: str | list[str],
     client_manager: ClientManager | None = None,
-    min_sample_size: int = 5,
-    max_sample_size: int = 100,
+    min_sample_size: int = 10,
+    max_sample_size: int | None = None,
     num_sample_tokens: int = 30000,
     settings: Settings = environment_settings,
     force: bool = False,
 
@@ -8,7 +8,8 @@
 from logging import Logger
 
 import weaviate
-from weaviate.classes.init import Auth
+from weaviate.classes.init import Auth, Timeout
+from weaviate.config import AdditionalConfig
 from weaviate.client import WeaviateClient, WeaviateAsyncClient
 from elysia.config import settings as environment_settings, Settings
 
@@ -67,6 +68,9 @@ def __init__(
         client_timeout: datetime.timedelta | int | None = None,
         logger: Logger | None = None,
         settings: Settings | None = None,
+        query_timeout: int = 60,
+        insert_timeout: int = 120,
+        init_timeout: int = 5,
         **kwargs,
     ) -> None:
         """
@@ -76,6 +80,9 @@ def __init__(
             client_timeout (datetime.timedelta | int | None): how long (in minutes) means the client should be restarted. Defaults to 3 minutes.
             logger (Logger | None): a logger object for logging messages. Defaults to None.
             settings (Settings | None): a settings object for the client manager. Defaults to environment settings.
+            query_timeout (int): the timeout for Weaviate queries. Defaults to 60 seconds (Weaviate default is 30 seconds).
+            insert_timeout (int): the timeout for Weaviate inserts. Defaults to 120 seconds (Weaviate default is 90 seconds).
+            init_timeout (int): the timeout for Weaviate initialisation. Defaults to 5 seconds (Weaviate default is 2 seconds).
             **kwargs (Any): any other api keys for third party services (formatted as e.g. OPENAI_APIKEY).
 
         Example:
@@ -116,6 +123,10 @@ def __init__(
         else:
             self.wcd_api_key = wcd_api_key
 
+        self.query_timeout = query_timeout
+        self.insert_timeout = insert_timeout
+        self.init_timeout = init_timeout
+
         # Set the api keys for non weaviate cluster (third parties)
         self.headers = {}
         for api_key in self.settings.API_KEYS:
@@ -244,6 +255,13 @@ def get_client(self) -> WeaviateClient:
             auth_credentials=Auth.api_key(self.wcd_api_key),
             headers=self.headers,
             skip_init_checks=True,
+            additional_config=AdditionalConfig(
+                timeout=Timeout(
+                    query=self.query_timeout,
+                    insert=self.insert_timeout,
+                    init=self.init_timeout,
+                )
+            ),
         )
 
     async def get_async_client(self) -> WeaviateAsyncClient:
@@ -255,6 +273,13 @@ async def get_async_client(self) -> WeaviateAsyncClient:
             auth_credentials=Auth.api_key(self.wcd_api_key),
             headers=self.headers,
             skip_init_checks=True,
+            additional_config=AdditionalConfig(
+                timeout=Timeout(
+                    query=self.query_timeout,
+                    insert=self.insert_timeout,
+                    init=self.init_timeout,
+                )
+            ),
         )
 
     @contextmanager
 
@@ -119,6 +119,11 @@ async def paginated_collection(
 ):
     collection = client.collections.get(collection_name)
 
+    if (page_size * (page_number - 1) + page_size) > 99_999:
+        raise ValueError(
+            "Page size exceeds Weaviate's limit of 100,000 objects for using offset."
+        )
+
     filter_type = filter_config.get("type", "all")
     filters_list = filter_config.get("filters", [])
     filters = [f["field"] for f in filters_list]
 
@@ -447,6 +447,72 @@ async def example_async_decorator_tool_from_tree():
     assert "example_async_decorator_tool_from_tree" in tree.tools
 
 
+def test_decorator_tool_typed_inputs():
+
+    tree = Tree()
+
+    @tool(tree=tree)
+    async def example_decorator_tool(x: int, y: int):
+        return x + y
+
+    assert "example_decorator_tool" in tree.tools
+    assert "x" in tree.tools["example_decorator_tool"].inputs
+    assert "y" in tree.tools["example_decorator_tool"].inputs
+    assert tree.tools["example_decorator_tool"].inputs["x"]["type"] is int
+    assert tree.tools["example_decorator_tool"].inputs["y"]["type"] is int
+
+
+def test_decorator_tool_typed_inputs_with_default_inputs():
+
+    tree = Tree()
+
+    @tool(tree=tree)
+    async def example_decorator_tool(x: int = 1, y: int = 2):
+        return x + y
+
+    assert "example_decorator_tool" in tree.tools
+
+    assert "x" in tree.tools["example_decorator_tool"].inputs
+    assert "y" in tree.tools["example_decorator_tool"].inputs
+    assert tree.tools["example_decorator_tool"].inputs["x"]["type"] is int
+    assert tree.tools["example_decorator_tool"].inputs["y"]["type"] is int
+    assert tree.tools["example_decorator_tool"].inputs["x"]["default"] == 1
+    assert tree.tools["example_decorator_tool"].inputs["y"]["default"] == 2
+
+
+def test_decorator_tool_untyped_inputs():
+
+    tree = Tree()
+
+    @tool(tree=tree)
+    async def example_decorator_tool(x, y):
+        return x + y
+
+    assert "example_decorator_tool" in tree.tools
+    assert "x" in tree.tools["example_decorator_tool"].inputs
+    assert "y" in tree.tools["example_decorator_tool"].inputs
+    assert tree.tools["example_decorator_tool"].inputs["x"]["type"] == "Not specified"
+    assert tree.tools["example_decorator_tool"].inputs["y"]["type"] == "Not specified"
+
+
+def test_decorator_with_elysia_inputs():
+    tree = Tree()
+
+    @tool(tree=tree)
+    async def example_decorator_tool(
+        x: int, y: int, tree_data, base_lm, complex_lm, client_manager
+    ):
+        return x + y
+
+    assert "example_decorator_tool" in tree.tools
+    assert "x" in tree.tools["example_decorator_tool"].inputs
+    assert "y" in tree.tools["example_decorator_tool"].inputs
+    assert "tree_data" not in tree.tools["example_decorator_tool"].inputs
+    assert "base_lm" not in tree.tools["example_decorator_tool"].inputs
+    assert "complex_lm" not in tree.tools["example_decorator_tool"].inputs
+    assert "client_manager" not in tree.tools["example_decorator_tool"].inputs
+
+
 @pytest.mark.asyncio
 async def test_add_tool_with_stem_tool():
     tree = Tree(