Unstructured-IO · yuming-long · Oct 2, 2024 · Sep 25, 2024 · Sep 26, 2024 · Sep 26, 2024
diff --git a/_test_unstructured_client/integration/test_decorators.py b/_test_unstructured_client/integration/test_decorators.py
@@ -32,7 +32,7 @@
     ],
 )
 def test_integration_split_pdf_has_same_output_as_non_split(
-    concurrency_level: int, filename: str, expected_ok: bool, strategy: str, caplog
+    concurrency_level: int, filename: str, expected_ok: bool, strategy: str
 ):
     """
     Tests that output that we get from the split-by-page pdf is the same as from non-split.
@@ -74,7 +74,6 @@ def test_integration_split_pdf_has_same_output_as_non_split(
         resp_split = client.general.partition(request=req)
     except (HTTPValidationError, AttributeError) as exc:
         if not expected_ok:
-            assert "The file does not appear to be a valid PDF." in caplog.text
             assert "File does not appear to be a valid PDF" in str(exc)
             return
         else:

diff --git a/_test_unstructured_client/unit/test_split_pdf_hook.py b/_test_unstructured_client/unit/test_split_pdf_hook.py
@@ -223,7 +223,7 @@ def test_unit_parse_form_data_none_filename_error():
 
 
 def test_unit_is_pdf_valid_pdf():
-    """Test is pdf method returns True for valid pdf file (has .pdf extension and can be read)."""
+    """Test is pdf method returns True for valid pdf file with filename."""
     filename = "_sample_docs/layout-parser-paper-fast.pdf"
 
     with open(filename, "rb") as f:
@@ -237,28 +237,48 @@ def test_unit_is_pdf_valid_pdf():
     assert result is True
 
 
-def test_unit_is_pdf_invalid_extension(caplog):
+def test_unit_is_pdf_valid_pdf_without_file_extension():
+    """Test is pdf method returns True for file with valid pdf content even when the file name has no .pdf extension."""
+    filename = "_sample_docs/layout-parser-paper-fast.pdf"
+
+    with open(filename, "rb") as f:
+        file = shared.Files(
+            content=f.read(),
+            file_name="uuid1234",
+        )
+
+    result = pdf_utils.is_pdf(file)
+
+    assert result is True
+
+
+def test_unit_is_pdf_invalid_extension():
     """Test is pdf method returns False for file with invalid extension."""
     file = shared.Files(content=b"txt_content", file_name="test_file.txt")
 
-    with caplog.at_level(logging.INFO):
-        result = pdf_utils.is_pdf(file)
+    result = pdf_utils.is_pdf(file)
 
     assert result is False
-    assert "Given file doesn't have '.pdf' extension" in caplog.text
 
 
-def test_unit_is_pdf_invalid_pdf(caplog):
+def test_unit_is_pdf_invalid_pdf():
     """Test is pdf method returns False for file with invalid pdf content."""
     file = shared.Files(content=b"invalid_pdf_content", file_name="test_file.pdf")
 
-    with caplog.at_level(logging.WARNING):
-        result = pdf_utils.is_pdf(file)
+    result = pdf_utils.is_pdf(file)
 
     assert result is False
-    assert "The file does not appear to be a valid PDF." in caplog.text
 
 
+def test_unit_is_pdf_invalid_pdf_without_file_extension():
+    """Test is pdf method returns False for file with invalid pdf content and a file name without an extension."""
+    file = shared.Files(content=b"invalid_pdf_content", file_name="uuid1234")
+
+    result = pdf_utils.is_pdf(file)
+
+    assert result is False
+
+
 def test_unit_get_starting_page_number_missing_key():
     """Test _get_starting_page_number method with missing key."""
     form_data = {}

diff --git a/src/unstructured_client/_hooks/custom/pdf_utils.py b/src/unstructured_client/_hooks/custom/pdf_utils.py
@@ -56,25 +56,19 @@ def get_pdf_pages(
 def is_pdf(file: shared.Files) -> bool:
     """Checks if the given file is a PDF.
 
-    First it checks the file extension and if it is equal to `.pdf`, then
-    it tries to read that file. If there is no error then we assume it is a proper PDF.
+    Tries to read the file as a PDF. If there is no error then we assume it is a proper PDF.
 
     Args:
         file: The file to be checked.
 
     Returns:
         True if the file is a PDF, False otherwise.
     """
-    if not file.file_name.endswith(".pdf"):
-        logger.info("Given file doesn't have '.pdf' extension, so splitting is not enabled.")
-        return False
 
     try:
         content = cast(bytes, file.content)
         PdfReader(io.BytesIO(content), strict=True)
-    except (PdfReadError, UnicodeDecodeError) as exc:
-        logger.error(exc)
-        logger.warning("The file does not appear to be a valid PDF.")
+    except (PdfReadError, UnicodeDecodeError):
         return False
 
     return True