Skip to content

Commit a487104

Browse files
committed
without forcing file extension
1 parent bb488f2 commit a487104

File tree

2 files changed

+29
-8
lines changed

2 files changed

+29
-8
lines changed

_test_unstructured_client/unit/test_split_pdf_hook.py

Lines changed: 28 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -223,7 +223,7 @@ def test_unit_parse_form_data_none_filename_error():
223223

224224

225225
def test_unit_is_pdf_valid_pdf():
226-
"""Test is pdf method returns True for valid pdf file (has .pdf extension and can be read)."""
226+
"""Test is pdf method returns True for valid pdf file with filename."""
227227
filename = "_sample_docs/layout-parser-paper-fast.pdf"
228228

229229
with open(filename, "rb") as f:
@@ -237,15 +237,30 @@ def test_unit_is_pdf_valid_pdf():
237237
assert result is True
238238

239239

240+
def test_unit_is_pdf_valid_pdf_without_file_extension(caplog):
241+
"""Test is pdf method returns True for file with valid pdf content without basing on file extension."""
242+
filename = "_sample_docs/layout-parser-paper-fast.pdf"
243+
244+
with open(filename, "rb") as f:
245+
file = shared.Files(
246+
content=f.read(),
247+
file_name="uuid1234",
248+
)
249+
250+
result = pdf_utils.is_pdf(file)
251+
252+
assert result is True
253+
254+
240255
def test_unit_is_pdf_invalid_extension(caplog):
241256
"""Test is pdf method returns False for file with invalid extension."""
242257
file = shared.Files(content=b"txt_content", file_name="test_file.txt")
243258

244-
with caplog.at_level(logging.INFO):
259+
with caplog.at_level(logging.WARNING):
245260
result = pdf_utils.is_pdf(file)
246261

247262
assert result is False
248-
assert "Given file doesn't have '.pdf' extension" in caplog.text
263+
assert "The file does not appear to be a valid PDF." in caplog.text
249264

250265

251266
def test_unit_is_pdf_invalid_pdf(caplog):
@@ -258,6 +273,16 @@ def test_unit_is_pdf_invalid_pdf(caplog):
258273
assert result is False
259274
assert "The file does not appear to be a valid PDF." in caplog.text
260275

276+
def test_unit_is_pdf_invalid_pdf_without_file_extension(caplog):
277+
"""Test is pdf method returns False for file with invalid pdf content without basing on file extension."""
278+
file = shared.Files(content=b"invalid_pdf_content", file_name="uuid1234")
279+
280+
with caplog.at_level(logging.WARNING):
281+
result = pdf_utils.is_pdf(file)
282+
283+
assert result is False
284+
assert "The file does not appear to be a valid PDF." in caplog.text
285+
261286

262287
def test_unit_get_starting_page_number_missing_key():
263288
"""Test _get_starting_page_number method with missing key."""

src/unstructured_client/_hooks/custom/pdf_utils.py

Lines changed: 1 addition & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -56,18 +56,14 @@ def get_pdf_pages(
5656
def is_pdf(file: shared.Files) -> bool:
5757
"""Checks if the given file is a PDF.
5858
59-
First it checks the file extension and if it is equal to `.pdf`, then
60-
it tries to read that file. If there is no error then we assume it is a proper PDF.
59+
Tries to read that file. If there is no error then we assume it is a proper PDF.
6160
6261
Args:
6362
file: The file to be checked.
6463
6564
Returns:
6665
True if the file is a PDF, False otherwise.
6766
"""
68-
if not file.file_name.endswith(".pdf"):
69-
logger.info("Given file doesn't have '.pdf' extension, so splitting is not enabled.")
70-
return False
7167

7268
try:
7369
content = cast(bytes, file.content)

0 commit comments

Comments
 (0)