Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: add support for direct file bytes #101

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 90 additions & 10 deletions cohere/compass/clients/parser.py
Original file line number Diff line number Diff line change
@@ -247,32 +247,112 @@ def process_file(
if doc.errors:
logger.error(f"Error opening document: {doc.errors}")
return []
if len(doc.filebytes) > DEFAULT_MAX_ACCEPTED_FILE_SIZE_BYTES:
max_size_mb = DEFAULT_MAX_ACCEPTED_FILE_SIZE_BYTES / 1000_000
logger.error(
f"File too large, supported file size is {max_size_mb} mb, "
f"filename {doc.metadata.filename}"
)
return []

return self._process_file_bytes(
params=self._get_file_params(
parser_config=parser_config,
metadata_config=metadata_config,
file_id=file_id,
content_type=content_type,
),
filename=filename,
file_bytes=doc.filebytes,
custom_context=custom_context,
)

@retry(
    stop=stop_after_attempt(DEFAULT_MAX_RETRIES),
    wait=wait_fixed(DEFAULT_SLEEP_RETRY_SECONDS),
    retry=retry_if_not_exception_type((InvalidSchema, CompassClientError)),
)
def process_file_bytes(
    self,
    *,
    filename: str,
    file_bytes: bytes,
    file_id: Optional[str] = None,
    content_type: Optional[str] = None,
    parser_config: Optional[ParserConfig] = None,
    metadata_config: Optional[MetadataConfig] = None,
    custom_context: Optional[Fn_or_Dict] = None,
) -> list[CompassDocument]:
    """
    Process a file whose content is already available in memory.

    Unlike the path-based entry point, nothing is read from disk here: the
    caller supplies the raw bytes together with the filename to report to the
    parser. The request parameters are assembled by ``_get_file_params`` and the
    upload itself is delegated to ``_process_file_bytes``.

    When ``parser_config`` or ``metadata_config`` is omitted (or falsy), the
    config stored on the client is used instead, so the configs only need to be
    supplied once, when the client is created.

    Payloads larger than ``DEFAULT_MAX_ACCEPTED_FILE_SIZE_BYTES`` are rejected by
    ``_process_file_bytes``, which logs an error and returns an empty list.

    :param filename: Name of the file the bytes belong to.
    :param file_bytes: Raw byte content of the file.
    :param file_id: Id for the file.
    :param content_type: Content type of the file.
    :param parser_config: ParserConfig object with the config to use for parsing
        the file.
    :param metadata_config: MetadataConfig object with the config to use for
        extracting metadata for each document.
    :param custom_context: Additional data to add to compass document. Fields
        will be filterable but not semantically searchable. Can either be a
        dictionary or a callable that takes a CompassDocument and returns a
        dictionary.

    :returns: List of resulting documents
    """
    # Resolve per-call overrides against the client-level defaults first, then
    # hand the fully-built parameters to the shared upload routine.
    request_params = self._get_file_params(
        parser_config=parser_config,
        metadata_config=metadata_config,
        file_id=file_id,
        content_type=content_type,
    )
    return self._process_file_bytes(
        params=request_params,
        filename=filename,
        file_bytes=file_bytes,
        custom_context=custom_context,
    )

def _get_file_params(
    self,
    *,
    parser_config: Optional[ParserConfig] = None,
    metadata_config: Optional[MetadataConfig] = None,
    file_id: Optional[str] = None,
    content_type: Optional[str] = None,
) -> ProcessFileParameters:
    """
    Build the request parameters for a process-file call.

    A config that is not supplied (or is falsy) is replaced by the
    corresponding config stored on the client.

    :param parser_config: ParserConfig to use; falls back to
        ``self.parser_config``.
    :param metadata_config: MetadataConfig to use; falls back to
        ``self.metadata_config``.
    :param file_id: Id for the file; passed on as ``doc_id``.
    :param content_type: Content type of the file.

    :returns: The populated ProcessFileParameters object.
    """
    # The stale `params = ProcessFileParameters(` opener that preceded the
    # `return` left an unclosed call; only the `return` form is kept.
    return ProcessFileParameters(
        parser_config=parser_config or self.parser_config,
        metadata_config=metadata_config or self.metadata_config,
        doc_id=file_id,
        content_type=content_type,
    )

def _process_file_bytes(
self,
*,
params: ProcessFileParameters,
filename: str,
file_bytes: bytes,
custom_context: Fn_or_Dict | None = None,
) -> list[CompassDocument]:
if len(file_bytes) > DEFAULT_MAX_ACCEPTED_FILE_SIZE_BYTES:
max_size_mb = DEFAULT_MAX_ACCEPTED_FILE_SIZE_BYTES / 1000_000
logger.error(
f"File too large, supported file size is {max_size_mb} mb"
+ f"filename {filename}"
)
return []
headers = None
if self.bearer_token:
headers = {"Authorization": f"Bearer {self.bearer_token}"}

res = self.session.post(
url=f"{self.parser_url}/v1/process_file",
data={"data": json.dumps(params.model_dump())},
files={"file": (filename, doc.filebytes)},
files={"file": (filename, file_bytes)},
headers=headers,
)