Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore Use pdf library to check file without extension #184

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions _test_unstructured_client/integration/test_decorators.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
],
)
def test_integration_split_pdf_has_same_output_as_non_split(
concurrency_level: int, filename: str, expected_ok: bool, strategy: str, caplog
concurrency_level: int, filename: str, expected_ok: bool, strategy: str
):
"""
Tests that output that we get from the split-by-page pdf is the same as from non-split.
Expand Down Expand Up @@ -74,7 +74,6 @@ def test_integration_split_pdf_has_same_output_as_non_split(
resp_split = client.general.partition(request=req)
except (HTTPValidationError, AttributeError) as exc:
if not expected_ok:
assert "The file does not appear to be a valid PDF." in caplog.text
assert "File does not appear to be a valid PDF" in str(exc)
return
else:
Expand Down
38 changes: 29 additions & 9 deletions _test_unstructured_client/unit/test_split_pdf_hook.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,7 @@ def test_unit_parse_form_data_none_filename_error():


def test_unit_is_pdf_valid_pdf():
"""Test is pdf method returns True for valid pdf file (has .pdf extension and can be read)."""
"""Test is pdf method returns True for valid pdf file with filename."""
filename = "_sample_docs/layout-parser-paper-fast.pdf"

with open(filename, "rb") as f:
Expand All @@ -237,28 +237,48 @@ def test_unit_is_pdf_valid_pdf():
assert result is True


def test_unit_is_pdf_invalid_extension(caplog):
def test_unit_is_pdf_valid_pdf_without_file_extension():
"""Test is pdf method returns True for a file with valid pdf content, without relying on the file extension."""
filename = "_sample_docs/layout-parser-paper-fast.pdf"

with open(filename, "rb") as f:
file = shared.Files(
content=f.read(),
file_name="uuid1234",
)

result = pdf_utils.is_pdf(file)

assert result is True


def test_unit_is_pdf_invalid_extension():
"""Test is pdf method returns False for file with invalid extension."""
file = shared.Files(content=b"txt_content", file_name="test_file.txt")

with caplog.at_level(logging.INFO):
result = pdf_utils.is_pdf(file)
result = pdf_utils.is_pdf(file)

assert result is False
assert "Given file doesn't have '.pdf' extension" in caplog.text


def test_unit_is_pdf_invalid_pdf(caplog):
def test_unit_is_pdf_invalid_pdf():
"""Test is pdf method returns False for file with invalid pdf content."""
file = shared.Files(content=b"invalid_pdf_content", file_name="test_file.pdf")

with caplog.at_level(logging.WARNING):
result = pdf_utils.is_pdf(file)
result = pdf_utils.is_pdf(file)

assert result is False
assert "The file does not appear to be a valid PDF." in caplog.text


def test_unit_is_pdf_invalid_pdf_without_file_extension():
"""Test is pdf method returns False for a file with invalid pdf content, without relying on the file extension."""
file = shared.Files(content=b"invalid_pdf_content", file_name="uuid1234")

result = pdf_utils.is_pdf(file)

assert result is False


def test_unit_get_starting_page_number_missing_key():
"""Test _get_starting_page_number method with missing key."""
form_data = {}
Expand Down
10 changes: 2 additions & 8 deletions src/unstructured_client/_hooks/custom/pdf_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,25 +56,19 @@ def get_pdf_pages(
def is_pdf(file: shared.Files) -> bool:
"""Checks if the given file is a PDF.

First it checks the file extension and if it is equal to `.pdf`, then
it tries to read that file. If there is no error then we assume it is a proper PDF.
Tries to read the file. If there is no error then we assume it is a proper PDF.

Args:
file: The file to be checked.

Returns:
True if the file is a PDF, False otherwise.
"""
if not file.file_name.endswith(".pdf"):
logger.info("Given file doesn't have '.pdf' extension, so splitting is not enabled.")
return False

try:
content = cast(bytes, file.content)
PdfReader(io.BytesIO(content), strict=True)
except (PdfReadError, UnicodeDecodeError) as exc:
logger.error(exc)
logger.warning("The file does not appear to be a valid PDF.")
except (PdfReadError, UnicodeDecodeError):
return False

return True