Skip to content

Commit dd9660f

Browse files
authored
fix: PyPDFToDocument initializes documents with content and meta (#8698)
* initialize document with content and meta
* update test
* add test checking that not only content is used for id generation
1 parent fe9b1e2 commit dd9660f

File tree

3 files changed

+15
-7
lines changed

3 files changed

+15
-7
lines changed

haystack/components/converters/pypdf.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,7 @@ def from_dict(cls, data):
155155
"""
156156
return default_from_dict(cls, data)
157157

158-
def _default_convert(self, reader: "PdfReader") -> Document:
158+
def _default_convert(self, reader: "PdfReader") -> str:
159159
texts = []
160160
for page in reader.pages:
161161
texts.append(
@@ -170,7 +170,7 @@ def _default_convert(self, reader: "PdfReader") -> Document:
170170
)
171171
)
172172
text = "\f".join(texts)
173-
return Document(content=text)
173+
return text
174174

175175
@component.output_types(documents=List[Document])
176176
def run(
@@ -205,14 +205,14 @@ def run(
205205
continue
206206
try:
207207
pdf_reader = PdfReader(io.BytesIO(bytestream.data))
208-
document = self._default_convert(pdf_reader)
208+
text = self._default_convert(pdf_reader)
209209
except Exception as e:
210210
logger.warning(
211211
"Could not read {source} and convert it to Document, skipping. {error}", source=source, error=e
212212
)
213213
continue
214214

215-
if document.content is None or document.content.strip() == "":
215+
if text is None or text.strip() == "":
216216
logger.warning(
217217
"PyPDFToDocument could not extract text from the file {source}. Returning an empty document.",
218218
source=source,
@@ -222,7 +222,7 @@ def run(
222222

223223
if not self.store_full_path and (file_path := bytestream.meta.get("file_path")):
224224
merged_metadata["file_path"] = os.path.basename(file_path)
225-
document.meta = merged_metadata
225+
document = Document(content=text, meta=merged_metadata)
226226
documents.append(document)
227227

228228
return {"documents": documents}
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
---
2+
fixes:
3+
- |
4+
PyPDFToDocument now creates documents with an id based on both the converted text and the metadata. Previously, the metadata was not taken into account.

test/components/converters/test_pypdf_to_document.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -113,8 +113,8 @@ def test_default_convert(self):
113113
layout_mode_font_height_weight=1.5,
114114
)
115115

116-
doc = converter._default_convert(mock_reader)
117-
assert doc.content == "Page 1 content\fPage 2 content"
116+
text = converter._default_convert(mock_reader)
117+
assert text == "Page 1 content\fPage 2 content"
118118

119119
expected_params = {
120120
"extraction_mode": "layout",
@@ -209,3 +209,7 @@ def test_run_empty_document(self, caplog, test_files_path):
209209
output = PyPDFToDocument().run(sources=paths)
210210
assert "PyPDFToDocument could not extract text from the file" in caplog.text
211211
assert output["documents"][0].content == ""
212+
213+
# Check that meta is used when the returned document is initialized and thus when doc id is generated
214+
assert output["documents"][0].meta["file_path"] == "non_text_searchable.pdf"
215+
assert output["documents"][0].id != Document(content="").id

0 commit comments

Comments
 (0)