From 092d2cb6f3663159d9a0e0dcb9445acec2677678 Mon Sep 17 00:00:00 2001 From: Tanzim Mokammel Date: Wed, 12 Mar 2025 18:01:38 -0400 Subject: [PATCH 1/3] fix: add support for direct file bytes --- cohere/compass/clients/parser.py | 62 +++++++++++++++++++++++++++----- 1 file changed, 54 insertions(+), 8 deletions(-) diff --git a/cohere/compass/clients/parser.py b/cohere/compass/clients/parser.py index 0d3b9bc..12bf09c 100644 --- a/cohere/compass/clients/parser.py +++ b/cohere/compass/clients/parser.py @@ -247,13 +247,6 @@ def process_file( if doc.errors: logger.error(f"Error opening document: {doc.errors}") return [] - if len(doc.filebytes) > DEFAULT_MAX_ACCEPTED_FILE_SIZE_BYTES: - max_size_mb = DEFAULT_MAX_ACCEPTED_FILE_SIZE_BYTES / 1000_000 - logger.error( - f"File too large, supported file size is {max_size_mb} mb, " - f"filename {doc.metadata.filename}" - ) - return [] parser_config = parser_config or self.parser_config metadata_config = metadata_config or self.metadata_config @@ -265,6 +258,59 @@ def process_file( content_type=content_type, ) + return self._process_file_bytes( + params=params, + filename=filename, + file_bytes=file_bytes, + custom_context=custom_context, + ) + + @retry( + stop=stop_after_attempt(DEFAULT_MAX_RETRIES), + wait=wait_fixed(DEFAULT_SLEEP_RETRY_SECONDS), + retry=retry_if_not_exception_type((InvalidSchema, CompassClientError)), + ) + def process_file( + self, + *, + filename: str, + file_bytes: bytes, + file_id: Optional[str] = None, + content_type: Optional[str] = None, + parser_config: Optional[ParserConfig] = None, + metadata_config: Optional[MetadataConfig] = None, + custom_context: Optional[Fn_or_Dict] = None, + ) -> list[CompassDocument]: + parser_config = parser_config or self.parser_config + metadata_config = metadata_config or self.metadata_config + params = ProcessFileParameters( + parser_config=parser_config, + metadata_config=metadata_config, + doc_id=file_id, + content_type=content_type, + ) + return 
self._process_file_bytes( + params=params, + filename=filename, + file_bytes=file_bytes, + custom_context=custom_context, + ) + + def _process_file_bytes( + self, + *, + params: ProcessFileParameters, + filename: str, + file_bytes: bytes, + custom_context: Fn_or_Dict | None = None, + ) -> list[CompassDocument]: + if len(file_bytes) > DEFAULT_MAX_ACCEPTED_FILE_SIZE_BYTES: + max_size_mb = DEFAULT_MAX_ACCEPTED_FILE_SIZE_BYTES / 1000_000 + logger.error( + f"File too large, supported file size is {max_size_mb} mb, " + + f"filename {filename}" + ) + return [] headers = None if self.bearer_token: headers = {"Authorization": f"Bearer {self.bearer_token}"} @@ -272,7 +318,7 @@ def process_file( res = self.session.post( url=f"{self.parser_url}/v1/process_file", data={"data": json.dumps(params.model_dump())}, - files={"file": (filename, doc.filebytes)}, + files={"file": (filename, file_bytes)}, headers=headers, ) From 6882875d706e399e2355509246d401465e084436 Mon Sep 17 00:00:00 2001 From: Tanzim Mokammel Date: Wed, 12 Mar 2025 18:03:48 -0400 Subject: [PATCH 2/3] . --- cohere/compass/clients/parser.py | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/cohere/compass/clients/parser.py b/cohere/compass/clients/parser.py index 12bf09c..b19a73b 100644 --- a/cohere/compass/clients/parser.py +++ b/cohere/compass/clients/parser.py @@ -270,7 +270,7 @@ def process_file( wait=wait_fixed(DEFAULT_SLEEP_RETRY_SECONDS), retry=retry_if_not_exception_type((InvalidSchema, CompassClientError)), ) - def process_file( + def process_file_bytes( self, *, filename: str, @@ -281,6 +281,31 @@ def process_file( metadata_config: Optional[MetadataConfig] = None, custom_context: Optional[Fn_or_Dict] = None, ) -> list[CompassDocument]: + """ + Process a file. + + The method takes in a file, its id, its byte array, + and the parser/metadata config. + If the config is None, then it uses the default configs passed by parameter when + creating the client. 
This makes the CompassParserClient stateful for + convenience, that is, one can pass in the parser/metadata config only once when + creating the CompassParserClient, and process files without having to pass the + config every time. + + :param filename: Filename to process. + :param file_bytes: byte content of the file + :param file_id: Id for the file. + :param content_type: Content type of the file. + :param parser_config: ParserConfig object with the config to use for parsing the + file. + :param metadata_config: MetadataConfig object with the config to use for + extracting metadata for each document. + :param custom_context: Additional data to add to compass document. Fields will + be filterable but not semantically searchable. Can either be a dictionary + or a callable that takes a CompassDocument and returns a dictionary. + + :returns: List of resulting documents + """ parser_config = parser_config or self.parser_config metadata_config = metadata_config or self.metadata_config params = ProcessFileParameters( From 8d0a664264e51ad35d15d4c55f04a22ca5f84773 Mon Sep 17 00:00:00 2001 From: Tanzim Mokammel Date: Wed, 12 Mar 2025 18:12:47 -0400 Subject: [PATCH 3/3] . 
--- cohere/compass/clients/parser.py | 47 +++++++++++++++++++------------- 1 file changed, 28 insertions(+), 19 deletions(-) diff --git a/cohere/compass/clients/parser.py b/cohere/compass/clients/parser.py index b19a73b..42f6deb 100644 --- a/cohere/compass/clients/parser.py +++ b/cohere/compass/clients/parser.py @@ -248,20 +248,15 @@ def process_file( logger.error(f"Error opening document: {doc.errors}") return [] - parser_config = parser_config or self.parser_config - metadata_config = metadata_config or self.metadata_config - - params = ProcessFileParameters( - parser_config=parser_config, - metadata_config=metadata_config, - doc_id=file_id, - content_type=content_type, - ) - return self._process_file_bytes( - params=params, + params=self._get_file_params( + parser_config=parser_config, + metadata_config=metadata_config, + file_id=file_id, + content_type=content_type, + ), filename=filename, - file_bytes=file_bytes, + file_bytes=doc.filebytes, custom_context=custom_context, ) @@ -306,20 +301,34 @@ def process_file_bytes( :returns: List of resulting documents """ + return self._process_file_bytes( + params=self._get_file_params( + parser_config=parser_config, + metadata_config=metadata_config, + file_id=file_id, + content_type=content_type, + ), + filename=filename, + file_bytes=file_bytes, + custom_context=custom_context, + ) + + def _get_file_params( + self, + *, + parser_config: Optional[ParserConfig] = None, + metadata_config: Optional[MetadataConfig] = None, + file_id: Optional[str] = None, + content_type: Optional[str] = None, + ) -> ProcessFileParameters: parser_config = parser_config or self.parser_config metadata_config = metadata_config or self.metadata_config - params = ProcessFileParameters( + return ProcessFileParameters( parser_config=parser_config, metadata_config=metadata_config, doc_id=file_id, content_type=content_type, ) - return self._process_file_bytes( - params=params, - filename=filename, - file_bytes=file_bytes, - custom_context=custom_context, - ) def 
_process_file_bytes( self,