fix: Allow split page logic to process files concurrently (#175)

awalker4 · web-flow · commit 54fd2eb7d13a · 2024-09-17T15:32:25.000-05:00
# The Issue

`split_pdf_hook.py` does not support multiple concurrent files. This is
because we store the split request tasks in
`self.coroutines_to_execute[operation_id]`, where `operation_id` is just
the string "partition". Therefore, if we send two concurrent docs using
the same SDK, they'll both try to await the same list of coroutines.
This could result in interleaved results, but mostly it breaks with
`RuntimeError: coroutine is being awaited already`, as the second
request gets ready to await its requests. This will block anyone trying
to use the new `partition_async` to fan out their pdfs.

Note that the js/ts client also has this issue.

# The fix

We need to use an actual id to index into `coroutines_to_execute`. In
`before_request`, let's make a uuid and build up the list of coroutines
for this doc. We need to pass this id to `after_success` in order to
retrieve the results, so we can set it as a header on our "dummy"
request that's returned to the SDK.

# Testing

See the new integration test. We can verify this by sending two docs
serially, and then with `asyncio.gather`, and confirm that the results
are the same.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -0,0 +1,8 @@
+## 0.26.0-dev
+
+### Enhancements
+
+### Features
+* Add `partition_async` for a non blocking alternative to `partition`
+
+### Fixes
diff --git a/_test_unstructured_client/integration/test_integration_freemium.py b/_test_unstructured_client/integration/test_integration_freemium.py
@@ -4,6 +4,7 @@
 from pathlib import Path
 
 import pytest
+from deepdiff import DeepDiff
 from unstructured_client import UnstructuredClient
 from unstructured_client.models import shared, operations
 from unstructured_client.models.errors import SDKError, ServerError, HTTPValidationError
@@ -125,6 +126,70 @@ async def test_partition_async_returns_elements(client, doc_path):
     assert len(response.elements)
 
 
+@pytest.mark.asyncio
+async def test_partition_async_processes_concurrent_files(client, doc_path):
+    """
+    Assert that partition_async can be used to send multiple files concurrently.
+    Send two separate portions of the test doc, serially and then using asyncio.gather.
+    The results for both runs should match.
+    """
+    filename = "layout-parser-paper.pdf"
+
+    with open(doc_path / filename, "rb") as f:
+        files = shared.Files(
+            content=f.read(),
+            file_name=filename,
+        )
+
+    # Set up two SDK requests
+    # For different page ranges
+    requests = [
+        operations.PartitionRequest(
+            partition_parameters=shared.PartitionParameters(
+                files=files,
+                strategy="fast",
+                languages=["eng"],
+                split_pdf_page=True,
+                split_pdf_page_range=[1, 3],
+            )
+        ),
+        operations.PartitionRequest(
+            partition_parameters=shared.PartitionParameters(
+                files=files,
+                strategy="fast",
+                languages=["eng"],
+                split_pdf_page=True,
+                split_pdf_page_range=[10, 12],
+            )
+        )
+    ]
+
+    serial_responses = []
+    for req in requests:
+        res = await client.general.partition_async(request=req)
+
+        assert res.status_code == 200
+        serial_responses.append(res.elements)
+
+    concurrent_responses = []
+    results = await asyncio.gather(
+        client.general.partition_async(request=requests[0]),
+        client.general.partition_async(request=requests[1])
+    )
+
+    for res in results:
+        assert res.status_code == 200
+        concurrent_responses.append(res.elements)
+
+    diff = DeepDiff(
+        t1=serial_responses,
+        t2=concurrent_responses,
+        ignore_order=True,
+    )
+
+    assert len(diff) == 0
+
+
 def test_uvloop_partitions_without_errors(client, doc_path):
     async def call_api():
         filename = "layout-parser-paper-fast.pdf"
diff --git a/src/unstructured_client/_hooks/custom/split_pdf_hook.py b/src/unstructured_client/_hooks/custom/split_pdf_hook.py
@@ -5,6 +5,7 @@
 import logging
 import os
 import math
+import uuid
 from collections.abc import Awaitable
 from typing import Any, Coroutine, Optional, Tuple, Union, cast
 
@@ -198,7 +199,12 @@ def before_request(
         # This allows us to use an event loop in an env with an existing loop
         # Temporary fix until we can improve the async splitting behavior
         nest_asyncio.apply()
-        operation_id = hook_ctx.operation_id
+
+        # This is our key into coroutines_to_execute
+        # We need to pass it on to after_success so
+        # we know which results are ours
+        operation_id = str(uuid.uuid4())
+
         content_type = request.headers.get("Content-Type")
 
         request_content = request.read()
@@ -329,16 +335,20 @@ async def call_api_partial(page):
             self.coroutines_to_execute[operation_id].append(coroutine)
             set_index += 1
 
+
         # Return a dummy request for the SDK to use
         # This allows us to skip right to the AfterRequestHook and await all the calls
+        # Also, pass the operation_id so after_success can await the right results
+
         # Note: We need access to the async_client from the sdk_init hook in order to set
         # up a mock request like this.
         # For now, just make an extra request against our api, which should return 200.
         # dummy_request = httpx.Request("GET",  "http://no-op")
-
-        dummy_request = httpx.Request("GET",  "https://api.unstructuredapp.io/general/docs")
-
-        return dummy_request
+        return httpx.Request(
+            "GET",
+            "https://api.unstructuredapp.io/general/docs",
+            headers={"operation_id": operation_id},
+        )
 
     def _await_elements(
             self, operation_id: str) -> Optional[list]:
@@ -407,9 +417,9 @@ def after_success(
             combined response object; otherwise, the original response. Can return
             exception if it ocurred during the execution.
         """
-        operation_id = hook_ctx.operation_id
-        # Because in `before_request` method we skipped sending last page in parallel
-        # we need to pass response, which contains last page, to `_await_elements` method
+        # Grab the correct id out of the dummy request
+        operation_id = response.request.headers.get("operation_id")
+
         elements = self._await_elements(operation_id)
 
         # if fails are disallowed, return the first failed response