fix: PyPDFToDocument initializes documents with content and meta (#8698)

julian-risch · web-flow · commit dd9660f90d8c · 2025-01-09T19:12:10.000Z
* initialize document with content and meta

* update test

* add test checking that not only content is used for id generation
diff --git a/haystack/components/converters/pypdf.py b/haystack/components/converters/pypdf.py
@@ -155,7 +155,7 @@ def from_dict(cls, data):
         """
         return default_from_dict(cls, data)
 
-    def _default_convert(self, reader: "PdfReader") -> Document:
+    def _default_convert(self, reader: "PdfReader") -> str:
         texts = []
         for page in reader.pages:
             texts.append(
@@ -170,7 +170,7 @@ def _default_convert(self, reader: "PdfReader") -> Document:
                 )
             )
         text = "\f".join(texts)
-        return Document(content=text)
+        return text
 
     @component.output_types(documents=List[Document])
     def run(
@@ -205,14 +205,14 @@ def run(
                 continue
             try:
                 pdf_reader = PdfReader(io.BytesIO(bytestream.data))
-                document = self._default_convert(pdf_reader)
+                text = self._default_convert(pdf_reader)
             except Exception as e:
                 logger.warning(
                     "Could not read {source} and convert it to Document, skipping. {error}", source=source, error=e
                 )
                 continue
 
-            if document.content is None or document.content.strip() == "":
+            if text is None or text.strip() == "":
                 logger.warning(
                     "PyPDFToDocument could not extract text from the file {source}. Returning an empty document.",
                     source=source,
@@ -222,7 +222,7 @@ def run(
 
             if not self.store_full_path and (file_path := bytestream.meta.get("file_path")):
                 merged_metadata["file_path"] = os.path.basename(file_path)
-            document.meta = merged_metadata
+            document = Document(content=text, meta=merged_metadata)
             documents.append(document)
 
         return {"documents": documents}
diff --git a/releasenotes/notes/pypdf-docid-293dac08ea5f8491.yaml b/releasenotes/notes/pypdf-docid-293dac08ea5f8491.yaml
@@ -0,0 +1,4 @@
+---
+fixes:
+  - |
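+    PyPDFToDocument now initializes each document with both the converted text and its metadata, so the document id is based on both. Previously, the metadata was assigned after the document was created and was not taken into account for the id.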
diff --git a/test/components/converters/test_pypdf_to_document.py b/test/components/converters/test_pypdf_to_document.py
@@ -113,8 +113,8 @@ def test_default_convert(self):
             layout_mode_font_height_weight=1.5,
         )
 
-        doc = converter._default_convert(mock_reader)
-        assert doc.content == "Page 1 content\fPage 2 content"
+        text = converter._default_convert(mock_reader)
+        assert text == "Page 1 content\fPage 2 content"
 
         expected_params = {
             "extraction_mode": "layout",
@@ -209,3 +209,7 @@ def test_run_empty_document(self, caplog, test_files_path):
             output = PyPDFToDocument().run(sources=paths)
             assert "PyPDFToDocument could not extract text from the file" in caplog.text
             assert output["documents"][0].content == ""
+
+            # Check that meta is used when the returned document is initialized and thus when doc id is generated
+            assert output["documents"][0].meta["file_path"] == "non_text_searchable.pdf"
+            assert output["documents"][0].id != Document(content="").id

-Original file line number
+Diff line change
@@ @@ -0,0 +1,4 @@ @@
 +---
 +fixes:
 +  - |
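 +    PyPDFToDocument now initializes each document with both the converted text and its metadata, so the document id is based on both. Previously, the metadata was assigned after the document was created and was not taken into account for the id.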