diff --git a/cohere/compass/clients/parser.py b/cohere/compass/clients/parser.py index 6a4e75f..c297a43 100644 --- a/cohere/compass/clients/parser.py +++ b/cohere/compass/clients/parser.py @@ -254,24 +254,104 @@ def process_file( if doc.errors: logger.error(f"Error opening document: {doc.errors}") return [] - if len(doc.filebytes) > DEFAULT_MAX_ACCEPTED_FILE_SIZE_BYTES: - max_size_mb = DEFAULT_MAX_ACCEPTED_FILE_SIZE_BYTES / 1000_000 - logger.error( - f"File too large, supported file size is {max_size_mb} mb, " - f"filename {doc.metadata.filename}" - ) - return [] + return self._process_file_bytes( + params=self._get_file_params( + parser_config=parser_config, + metadata_config=metadata_config, + file_id=file_id, + content_type=content_type, + ), + filename=filename, + file_bytes=doc.filebytes, + custom_context=custom_context, + ) + + @retry( + stop=stop_after_attempt(DEFAULT_MAX_RETRIES), + wait=wait_fixed(DEFAULT_SLEEP_RETRY_SECONDS), + retry=retry_if_not_exception_type((InvalidSchema, CompassClientError)), + ) + def process_file_bytes( + self, + *, + filename: str, + file_bytes: bytes, + file_id: Optional[str] = None, + content_type: Optional[str] = None, + parser_config: Optional[ParserConfig] = None, + metadata_config: Optional[MetadataConfig] = None, + custom_context: Optional[Fn_or_Dict] = None, + ) -> list[CompassDocument]: + """ + Process a file. + + The method takes in a file, its id, its byte array, + and the parser/metadata config. + If the config is None, then it uses the default configs passed by parameter when + creating the client. This makes the CompassParserClient stateful for + convenience, that is, one can pass in the parser/metadata config only once when + creating the CompassParserClient, and process files without having to pass the + config every time. + + :param filename: filename to process. + :param file_bytes: byte content of the file + :param file_id: ID for the file. + :param content_type: Content type of the file. + :param parser_config: ParserConfig object with the config to use for parsing the + file. + :param metadata_config: MetadataConfig object with the config to use for + extracting metadata for each document. + :param custom_context: Additional data to add to compass document. Fields will + be filterable but not semantically searchable. Can either be a dictionary + or a callable that takes a CompassDocument and returns a dictionary. + + :returns: List of resulting documents + """ + return self._process_file_bytes( + params=self._get_file_params( + parser_config=parser_config, + metadata_config=metadata_config, + file_id=file_id, + content_type=content_type, + ), + filename=filename, + file_bytes=file_bytes, + custom_context=custom_context, + ) + + def _get_file_params( + self, + *, + parser_config: Optional[ParserConfig] = None, + metadata_config: Optional[MetadataConfig] = None, + file_id: Optional[str] = None, + content_type: Optional[str] = None, + ): parser_config = parser_config or self.parser_config metadata_config = metadata_config or self.metadata_config - - params = ProcessFileParameters( + return ProcessFileParameters( parser_config=parser_config, metadata_config=metadata_config, doc_id=file_id, content_type=content_type, ) + def _process_file_bytes( + self, + *, + params: ProcessFileParameters, + filename: str, + file_bytes: bytes, + custom_context: Optional[Fn_or_Dict] = None, + ) -> list[CompassDocument]: + if len(file_bytes) > DEFAULT_MAX_ACCEPTED_FILE_SIZE_BYTES: + max_size_mb = DEFAULT_MAX_ACCEPTED_FILE_SIZE_BYTES / 1000_000 + logger.error( + f"File too large, supported file size is {max_size_mb} mb" + + f"filename {filename}" + ) + return [] headers = None if self.bearer_token: headers = {"Authorization": f"Bearer {self.bearer_token}"} @@ -279,7 +359,7 @@ def process_file( res = self.session.post( url=f"{self.parser_url}/v1/process_file", data={"data": json.dumps(params.model_dump())}, - files={"file": (filename, doc.filebytes)}, + files={"file": (filename, file_bytes)}, headers=headers, ) diff --git a/poetry.lock b/poetry.lock index 2e118d5..c7619cd 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.0.1 and should not be changed by hand. [[package]] name = "annotated-types" @@ -6,6 +6,7 @@ version = "0.7.0" description = "Reusable constraint types to use with typing.Annotated" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53"}, {file = "annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89"}, @@ -17,6 +18,7 @@ version = "2024.8.30" description = "Python package for providing Mozilla's CA Bundle." optional = false python-versions = ">=3.6" +groups = ["main", "dev"] files = [ {file = "certifi-2024.8.30-py3-none-any.whl", hash = "sha256:922820b53db7a7257ffbda3f597266d435245903d80737e34f8a45ff3e3230d8"}, {file = "certifi-2024.8.30.tar.gz", hash = "sha256:bec941d2aa8195e248a60b31ff9f0558284cf01a52591ceda73ea9afffd69fd9"}, @@ -28,6 +30,7 @@ version = "3.4.0" description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." optional = false python-versions = ">=3.7.0" +groups = ["main", "dev"] files = [ {file = "charset_normalizer-3.4.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:4f9fc98dad6c2eaa32fc3af1417d95b5e3d08aff968df0cd320066def971f9a6"}, {file = "charset_normalizer-3.4.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0de7b687289d3c1b3e8660d0741874abe7888100efe14bd0f9fd7141bcbda92b"}, @@ -142,6 +145,8 @@ version = "0.4.6" description = "Cross-platform colored terminal text." optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" +groups = ["dev"] +markers = "sys_platform == \"win32\"" files = [ {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, @@ -153,6 +158,8 @@ version = "1.2.2" description = "Backport of PEP 654 (exception groups)" optional = false python-versions = ">=3.7" +groups = ["dev"] +markers = "python_version < \"3.11\"" files = [ {file = "exceptiongroup-1.2.2-py3-none-any.whl", hash = "sha256:3111b9d131c238bec2f8f516e123e14ba243563fb135d3fe885990585aa7795b"}, {file = "exceptiongroup-1.2.2.tar.gz", hash = "sha256:47c2edf7c6738fafb49fd34290706d1a1a2f4d1c6df275526b62cbb4aa5393cc"}, @@ -167,6 +174,7 @@ version = "2024.12.0" description = "File-system specification" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "fsspec-2024.12.0-py3-none-any.whl", hash = "sha256:b520aed47ad9804237ff878b504267a3b0b441e97508bd6d2d8774e3db85cee2"}, {file = "fsspec-2024.12.0.tar.gz", hash = "sha256:670700c977ed2fb51e0d9f9253177ed20cbde4a3e5c0283cc5385b5870c8533f"}, @@ -206,6 +214,7 @@ version = "3.10" description = "Internationalized Domain Names in Applications (IDNA)" optional = false python-versions = ">=3.6" +groups = ["main", "dev"] files = [ {file = "idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3"}, {file = "idna-3.10.tar.gz", hash = "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9"}, @@ -220,6 +229,7 @@ version = "2.0.0" description = "brain-dead simple config-ini parsing" optional = false python-versions = ">=3.7" +groups = ["dev"] files = [ {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"}, {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, @@ -231,6 +241,7 @@ version = "1.4.2" description = "Lightweight pipelining with Python functions" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "joblib-1.4.2-py3-none-any.whl", hash = "sha256:06d478d5674cbc267e7496a410ee875abd68e4340feff4490bcb7afb88060ae6"}, {file = "joblib-1.4.2.tar.gz", hash = "sha256:2382c5816b2636fbd20a09e0f4e9dad4736765fdfb7dca582943b9c1366b3f0e"}, @@ -242,6 +253,7 @@ version = "1.9.1" description = "Node.js virtual environment builder" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" +groups = ["dev"] files = [ {file = "nodeenv-1.9.1-py2.py3-none-any.whl", hash = "sha256:ba11c9782d29c27c70ffbdda2d7415098754709be8a7056d79a737cd901155c9"}, {file = "nodeenv-1.9.1.tar.gz", hash = "sha256:6ec12890a2dab7946721edbfbcd91f3319c6ccc9aec47be7c7e6b7011ee6645f"}, @@ -253,6 +265,7 @@ version = "24.2" description = "Core utilities for Python packages" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "packaging-24.2-py3-none-any.whl", hash = "sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759"}, {file = "packaging-24.2.tar.gz", hash = "sha256:c228a6dc5e932d346bc5739379109d49e8853dd8223571c7c5b55260edc0b97f"}, @@ -264,6 +277,7 @@ version = "1.5.0" description = "plugin and hook calling mechanisms for python" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669"}, {file = "pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1"}, @@ -279,6 +293,7 @@ version = "2.10.3" description = "Data validation using Python type hints" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "pydantic-2.10.3-py3-none-any.whl", hash = "sha256:be04d85bbc7b65651c5f8e6b9976ed9c6f41782a55524cef079a34a0bb82144d"}, {file = "pydantic-2.10.3.tar.gz", hash = "sha256:cb5ac360ce894ceacd69c403187900a02c4b20b693a9dd1d643e1effab9eadf9"}, @@ -299,6 +314,7 @@ version = "2.27.1" description = "Core functionality for Pydantic validation and serialization" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "pydantic_core-2.27.1-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:71a5e35c75c021aaf400ac048dacc855f000bdfed91614b4a726f7432f1f3d6a"}, {file = "pydantic_core-2.27.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f82d068a2d6ecfc6e054726080af69a6764a10015467d7d7b9f66d6ed5afa23b"}, @@ -411,6 +427,7 @@ version = "1.1.390" description = "Command line wrapper for pyright" optional = false python-versions = ">=3.7" +groups = ["dev"] files = [ {file = "pyright-1.1.390-py3-none-any.whl", hash = "sha256:ecebfba5b6b50af7c1a44c2ba144ba2ab542c227eb49bc1f16984ff714e0e110"}, {file = "pyright-1.1.390.tar.gz", hash = "sha256:aad7f160c49e0fbf8209507a15e17b781f63a86a1facb69ca877c71ef2e9538d"}, @@ -431,6 +448,7 @@ version = "8.3.4" description = "pytest: simple powerful testing with Python" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "pytest-8.3.4-py3-none-any.whl", hash = "sha256:50e16d954148559c9a74109af1eaf0c945ba2d8f30f0a3d3335edde19788b6f6"}, {file = "pytest-8.3.4.tar.gz", hash = "sha256:965370d062bce11e73868e0335abac31b4d3de0e82f4007408d242b4f8610761"}, @@ -453,6 +471,7 @@ version = "0.24.0" description = "Pytest support for asyncio" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "pytest_asyncio-0.24.0-py3-none-any.whl", hash = "sha256:a811296ed596b69bf0b6f3dc40f83bcaf341b155a269052d82efa2b25ac7037b"}, {file = "pytest_asyncio-0.24.0.tar.gz", hash = "sha256:d081d828e576d85f875399194281e92bf8a68d60d72d1a2faf2feddb6c46b276"}, @@ -471,6 +490,7 @@ version = "3.14.0" description = "Thin-wrapper around the mock package for easier use with pytest" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "pytest-mock-3.14.0.tar.gz", hash = "sha256:2719255a1efeceadbc056d6bf3df3d1c5015530fb40cf347c0f9afac88410bd0"}, {file = "pytest_mock-3.14.0-py3-none-any.whl", hash = "sha256:0b72c38033392a5f4621342fe11e9219ac11ec9d375f8e2a0c164539e0d70f6f"}, @@ -488,6 +508,7 @@ version = "2.32.3" description = "Python HTTP for Humans." optional = false python-versions = ">=3.8" +groups = ["main", "dev"] files = [ {file = "requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6"}, {file = "requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760"}, @@ -509,6 +530,7 @@ version = "1.12.1" description = "Mock out responses from the requests package" optional = false python-versions = ">=3.5" +groups = ["dev"] files = [ {file = "requests-mock-1.12.1.tar.gz", hash = "sha256:e9e12e333b525156e82a3c852f22016b9158220d2f47454de9cae8a77d371401"}, {file = "requests_mock-1.12.1-py2.py3-none-any.whl", hash = "sha256:b1e37054004cdd5e56c84454cc7df12b25f90f382159087f4b6915aaeef39563"}, @@ -526,6 +548,7 @@ version = "0.8.3" description = "An extremely fast Python linter and code formatter, written in Rust." optional = false python-versions = ">=3.7" +groups = ["dev"] files = [ {file = "ruff-0.8.3-py3-none-linux_armv6l.whl", hash = "sha256:8d5d273ffffff0acd3db5bf626d4b131aa5a5ada1276126231c4174543ce20d6"}, {file = "ruff-0.8.3-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:e4d66a21de39f15c9757d00c50c8cdd20ac84f55684ca56def7891a025d7e939"}, @@ -553,6 +576,7 @@ version = "8.2.3" description = "Retry code until it succeeds" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "tenacity-8.2.3-py3-none-any.whl", hash = "sha256:ce510e327a630c9e1beaf17d42e6ffacc88185044ad85cf74c0a8887c6a0f88c"}, {file = "tenacity-8.2.3.tar.gz", hash = "sha256:5398ef0d78e63f40007c1fb4c0bff96e1911394d2fa8d194f77619c05ff6cc8a"}, @@ -567,6 +591,8 @@ version = "2.2.1" description = "A lil' TOML parser" optional = false python-versions = ">=3.8" +groups = ["dev"] +markers = "python_version < \"3.11\"" files = [ {file = "tomli-2.2.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:678e4fa69e4575eb77d103de3df8a895e1591b48e740211bd1067378c69e8249"}, {file = "tomli-2.2.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:023aa114dd824ade0100497eb2318602af309e5a55595f76b626d6d9f3b7b0a6"}, @@ -608,6 +634,7 @@ version = "4.12.2" description = "Backported and Experimental Type Hints for Python 3.8+" optional = false python-versions = ">=3.8" +groups = ["main", "dev"] files = [ {file = "typing_extensions-4.12.2-py3-none-any.whl", hash = "sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d"}, {file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"}, @@ -619,6 +646,7 @@ version = "2.2.3" description = "HTTP library with thread-safe connection pooling, file post, and more." optional = false python-versions = ">=3.8" +groups = ["main", "dev"] files = [ {file = "urllib3-2.2.3-py3-none-any.whl", hash = "sha256:ca899ca043dcb1bafa3e262d73aa25c465bfb49e0bd9dd5d59f1d0acba2f8fac"}, {file = "urllib3-2.2.3.tar.gz", hash = "sha256:e7d814a81dad81e6caf2ec9fdedb284ecc9c73076b62654547cc64ccdcae26e9"}, @@ -631,6 +659,6 @@ socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] zstd = ["zstandard (>=0.18.0)"] [metadata] -lock-version = "2.0" +lock-version = "2.1" python-versions = ">=3.9,<4.0" content-hash = "048fe42dd3e2c20ca877875166bb2968acd13f0d61b96c37d8cea6bb91ee9b7f" diff --git a/pyproject.toml b/pyproject.toml index 69f7935..b267cbc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "compass-sdk" -version = "0.19.0" +version = "0.20.0" authors = [] description = "Compass SDK" readme = "README.md" diff --git a/tests/test_parser_client.py b/tests/test_parser_client.py new file mode 100644 index 0000000..8e3f98e --- /dev/null +++ b/tests/test_parser_client.py @@ -0,0 +1,25 @@ +from requests_mock import Mocker + +from cohere.compass.clients import CompassParserClient + + +def test_process_file_bytes(requests_mock: Mocker) -> None: + requests_mock.post("mock://test.com/v1/process_file", json={"docs": []}) + client = CompassParserClient(parser_url="mock://test.com") + client.process_file_bytes(filename="test.pdf", file_bytes=b"0") + assert requests_mock.request_history[0].method == "POST" + assert requests_mock.request_history[0].url == "mock://test.com/v1/process_file" + headers = requests_mock.request_history[0].headers + assert "multipart/form-data" in headers["Content-Type"] + assert "Authorization" not in headers + + +def test_process_file_bytes_with_auth(requests_mock: Mocker) -> None: + requests_mock.post("mock://test.com/v1/process_file", json={"docs": []}) + client = CompassParserClient(parser_url="mock://test.com", bearer_token="secret") + client.process_file_bytes(filename="test.pdf", file_bytes=b"0") + assert requests_mock.request_history[0].method == "POST" + assert requests_mock.request_history[0].url == "mock://test.com/v1/process_file" + headers = requests_mock.request_history[0].headers + assert "multipart/form-data" in headers["Content-Type"] + assert "Authorization" in headers