Skip to content

Commit

Permalink
without forcing file extension
Browse files Browse the repository at this point in the history
  • Loading branch information
yuming-long committed Sep 25, 2024
1 parent bb488f2 commit a487104
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 8 deletions.
31 changes: 28 additions & 3 deletions _test_unstructured_client/unit/test_split_pdf_hook.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,7 @@ def test_unit_parse_form_data_none_filename_error():


def test_unit_is_pdf_valid_pdf():
"""Test is pdf method returns True for valid pdf file (has .pdf extension and can be read)."""
"""Test is pdf method returns True for valid pdf file with filename."""
filename = "_sample_docs/layout-parser-paper-fast.pdf"

with open(filename, "rb") as f:
Expand All @@ -237,15 +237,30 @@ def test_unit_is_pdf_valid_pdf():
assert result is True


def test_unit_is_pdf_valid_pdf_without_file_extension(caplog):
    """Check that is_pdf returns True for real PDF bytes when the file name has no '.pdf' suffix."""
    sample_path = "_sample_docs/layout-parser-paper-fast.pdf"

    with open(sample_path, "rb") as pdf_stream:
        pdf_bytes = pdf_stream.read()

    # The name deliberately carries no extension, so only the content can
    # identify this file as a PDF.
    extensionless_file = shared.Files(content=pdf_bytes, file_name="uuid1234")

    result = pdf_utils.is_pdf(extensionless_file)

    assert result is True


def test_unit_is_pdf_invalid_extension(caplog):
    """Check that is_pdf returns False for a '.txt' file whose content is not a PDF.

    The expected log text is the invalid-PDF message; the test captures at
    WARNING level and above.
    """
    file = shared.Files(content=b"txt_content", file_name="test_file.txt")

    # Capture at WARNING so the invalid-PDF message is recorded in caplog.
    with caplog.at_level(logging.WARNING):
        result = pdf_utils.is_pdf(file)

    assert result is False
    assert "The file does not appear to be a valid PDF." in caplog.text


def test_unit_is_pdf_invalid_pdf(caplog):
Expand All @@ -258,6 +273,16 @@ def test_unit_is_pdf_invalid_pdf(caplog):
assert result is False
assert "The file does not appear to be a valid PDF." in caplog.text

def test_unit_is_pdf_invalid_pdf_without_file_extension(caplog):
    """Check that is_pdf returns False for non-PDF bytes under an extension-less file name.

    Also verifies that the invalid-PDF message shows up in the captured log.
    """
    bogus_file = shared.Files(
        content=b"invalid_pdf_content",
        file_name="uuid1234",
    )

    with caplog.at_level(logging.WARNING):
        is_pdf_result = pdf_utils.is_pdf(bogus_file)

    expected_message = "The file does not appear to be a valid PDF."
    assert is_pdf_result is False
    assert expected_message in caplog.text


def test_unit_get_starting_page_number_missing_key():
"""Test _get_starting_page_number method with missing key."""
Expand Down
6 changes: 1 addition & 5 deletions src/unstructured_client/_hooks/custom/pdf_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,18 +56,14 @@ def get_pdf_pages(
def is_pdf(file: shared.Files) -> bool:
"""Checks if the given file is a PDF.
First it checks the file extension and if it is equal to `.pdf`, then
it tries to read that file. If there is no error then we assume it is a proper PDF.
Tries to read that file. If there is no error then we assume it is a proper PDF.
Args:
file: The file to be checked.
Returns:
True if the file is a PDF, False otherwise.
"""
if not file.file_name.endswith(".pdf"):
logger.info("Given file doesn't have '.pdf' extension, so splitting is not enabled.")
return False

try:
content = cast(bytes, file.content)
Expand Down

0 comments on commit a487104

Please sign in to comment.