Skip to content

Commit 2581daa

Browse files
authored
refactor: Switched from PyMuPDF to pypdfium2 (mindee#829)
* chore: Updated PDF lib
* refactor: Refactored PDF parsing
* test: Updated unittests
* docs: Updated instructions
* refactor: Switched to another PDF backend
* docs: Updated documentation
* fix: Fixed demo
* refactor: Removed legacy imports
* style: Updated mypy config
* fix: Fixed read_pdf
* chore: Updated deps
* test: Fixed unittests
* fix: Fixed analysis script
* chore: Fixed requirements
* test: Removed PyMuPDF from unittests
* chore: Removed PyMuPDF
* test: Fixed unittest
* fix: Fixed Dockerfile
1 parent 0f79736 commit 2581daa

17 files changed

+49
-251
lines changed

.github/workflows/docker.yml

+2-2
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,9 @@ jobs:
1212
steps:
1313
- uses: actions/checkout@v2
1414
- name: Build docker image
15-
run: docker build . -t doctr-py3.8.1-tf2.4-slim
15+
run: docker build . -t doctr-tf-py3.8-slim
1616
- name: Run docker container
17-
run: docker run doctr-py3.8.1-tf2.4-slim python -c 'import doctr'
17+
run: docker run doctr-tf-py3.8-slim python -c 'import doctr'
1818

1919
pytest-api:
2020
runs-on: ubuntu-latest

Dockerfile

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
FROM python:3.8.1-slim
1+
FROM python:3.8-slim
22

33
ENV PYTHONUNBUFFERED 1
44
ENV PYTHONDONTWRITEBYTECODE 1

README.md

+3-3
Original file line numberDiff line numberDiff line change
@@ -34,11 +34,11 @@ Documents can be interpreted from PDF or images:
3434
```python
3535
from doctr.io import DocumentFile
3636
# PDF
37-
pdf_doc = DocumentFile.from_pdf("path/to/your/doc.pdf").as_images()
37+
pdf_doc = DocumentFile.from_pdf("path/to/your/doc.pdf")
3838
# Image
3939
single_img_doc = DocumentFile.from_images("path/to/your/img.jpg")
4040
# Webpage
41-
webpage_doc = DocumentFile.from_url("https://www.yoursite.com").as_images()
41+
webpage_doc = DocumentFile.from_url("https://www.yoursite.com")
4242
# Multiple page images
4343
multi_img_doc = DocumentFile.from_images(["path/to/page1.jpg", "path/to/page2.jpg"])
4444
```
@@ -51,7 +51,7 @@ from doctr.models import ocr_predictor
5151

5252
model = ocr_predictor(pretrained=True)
5353
# PDF
54-
doc = DocumentFile.from_pdf("path/to/your/doc.pdf").as_images()
54+
doc = DocumentFile.from_pdf("path/to/your/doc.pdf")
5555
# Analyze
5656
result = model(doc)
5757
```

demo/app.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ def main():
5252
uploaded_file = st.sidebar.file_uploader("Upload files", type=['pdf', 'png', 'jpeg', 'jpg'])
5353
if uploaded_file is not None:
5454
if uploaded_file.name.endswith('.pdf'):
55-
doc = DocumentFile.from_pdf(uploaded_file.read()).as_images()
55+
doc = DocumentFile.from_pdf(uploaded_file.read())
5656
else:
5757
doc = DocumentFile.from_images(uploaded_file.read())
5858
page_idx = st.sidebar.selectbox("Page selection", [idx + 1 for idx in range(len(doc))]) - 1

docs/source/io.rst

-10
Original file line numberDiff line numberDiff line change
@@ -82,13 +82,3 @@ High-performance file reading and conversion to processable structured data.
8282
.. automethod:: from_url
8383

8484
.. automethod:: from_images
85-
86-
.. autoclass:: PDF
87-
88-
.. automethod:: as_images
89-
90-
.. automethod:: get_words
91-
92-
.. automethod:: get_lines
93-
94-
.. automethod:: get_artefacts

doctr/io/pdf.py

+10-156
Original file line numberDiff line numberDiff line change
@@ -4,18 +4,17 @@
44
# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
55

66
from pathlib import Path
7-
from typing import Any, Dict, List, Optional, Tuple
7+
from typing import Any, List
88

9-
import cv2
10-
import fitz
119
import numpy as np
10+
import pypdfium2 as pdfium
1211

13-
from doctr.utils.common_types import AbstractFile, Bbox
12+
from doctr.utils.common_types import AbstractFile
1413

15-
__all__ = ['read_pdf', 'PDF']
14+
__all__ = ['read_pdf']
1615

1716

18-
def read_pdf(file: AbstractFile, **kwargs: Any) -> fitz.Document:
17+
def read_pdf(file: AbstractFile, scale: float = 2, **kwargs: Any) -> List[np.ndarray]:
1918
"""Read a PDF file and convert it into an image in numpy format
2019
2120
Example::
@@ -24,161 +23,16 @@ def read_pdf(file: AbstractFile, **kwargs: Any) -> fitz.Document:
2423
2524
Args:
2625
file: the path to the PDF file, or the PDF content as bytes
26+
scale: rendering scale (1 corresponds to 72dpi)
2727
Returns:
2828
the list of pages decoded as numpy ndarray of shape H x W x 3
2929
"""
3030

31+
if not isinstance(file, (str, Path, bytes)):
32+
raise TypeError("unsupported object type for argument 'file'")
33+
3134
if isinstance(file, (str, Path)) and not Path(file).is_file():
3235
raise FileNotFoundError(f"unable to access {file}")
3336

34-
fitz_args: Dict[str, AbstractFile] = {}
35-
36-
if isinstance(file, (str, Path)):
37-
fitz_args['filename'] = file
38-
elif isinstance(file, bytes):
39-
fitz_args['stream'] = file
40-
else:
41-
raise TypeError("unsupported object type for argument 'file'")
42-
4337
# Render pages with pypdfium2 and convert them to numpy ndarrays
44-
return fitz.open(**fitz_args, filetype="pdf", **kwargs)
45-
46-
47-
def convert_page_to_numpy(
48-
page: fitz.fitz.Page,
49-
output_size: Optional[Tuple[int, int]] = None,
50-
bgr_output: bool = False,
51-
default_scales: Tuple[float, float] = (2, 2),
52-
) -> np.ndarray:
53-
"""Convert a fitz page to a numpy-formatted image
54-
55-
Args:
56-
page: the page of a file read with PyMuPDF
57-
output_size: the expected output size of each page in format H x W. Default goes to 840 x 595 for A4 pdf,
58-
if you want to increase the resolution while preserving the original A4 aspect ratio can pass (1024, 726)
59-
rgb_output: whether the output ndarray channel order should be RGB instead of BGR.
60-
default_scales: spatial scaling to be applied when output_size is not specified where (1, 1)
61-
corresponds to 72 dpi rendering.
62-
63-
Returns:
64-
the rendered image in numpy format
65-
"""
66-
67-
# If no output size is specified, keep the origin one
68-
if output_size is not None:
69-
scales = (output_size[1] / page.MediaBox[2], output_size[0] / page.MediaBox[3])
70-
else:
71-
# Default 72 DPI (scales of (1, 1)) is unnecessarily low
72-
scales = default_scales
73-
74-
transform_matrix = fitz.Matrix(*scales)
75-
76-
# Generate the pixel map using the transformation matrix
77-
pixmap = page.get_pixmap(matrix=transform_matrix)
78-
# Decode it into a numpy
79-
img = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.height, pixmap.width, 3)
80-
81-
# Switch the channel order
82-
if bgr_output:
83-
img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
84-
85-
return img
86-
87-
88-
class PDF:
89-
"""PDF document template
90-
91-
Args:
92-
doc: input PDF document
93-
"""
94-
def __init__(self, doc: fitz.Document) -> None:
95-
self.doc = doc
96-
97-
def as_images(self, **kwargs) -> List[np.ndarray]:
98-
"""Convert all document pages to images
99-
100-
Example::
101-
>>> from doctr.documents import DocumentFile
102-
>>> pages = DocumentFile.from_pdf("path/to/your/doc.pdf").as_images()
103-
104-
Args:
105-
kwargs: keyword arguments of `convert_page_to_numpy`
106-
Returns:
107-
the list of pages decoded as numpy ndarray of shape H x W x 3
108-
"""
109-
return [convert_page_to_numpy(page, **kwargs) for page in self.doc]
110-
111-
def get_page_lines(self, idx, **kwargs) -> List[Tuple[Bbox, str]]:
112-
"""Get the annotations for all lines of a given page"""
113-
lines: List[Tuple[Bbox, str]] = []
114-
prev_block, prev_line = -1, -1
115-
current_line = []
116-
xmin, ymin, xmax, ymax = 0, 0, 0, 0
117-
# xmin, ymin, xmax, ymax, value, block_idx, line_idx, word_idx
118-
for info in self.doc[idx].get_text_words(**kwargs):
119-
if prev_block == info[-3] and prev_line == info[-2]:
120-
current_line.append(info[4])
121-
xmin, ymin = min(xmin, info[0]), min(ymin, info[1])
122-
xmax, ymax = max(xmax, info[2]), max(ymax, info[3])
123-
else:
124-
if len(current_line) > 0:
125-
lines.append(((xmin, ymin, xmax, ymax), " ".join(current_line)))
126-
current_line = [info[4]]
127-
prev_block, prev_line = info[-3], info[-2]
128-
xmin, ymin, xmax, ymax = info[:4]
129-
130-
if len(current_line) > 0:
131-
lines.append(((xmin, ymin, xmax, ymax), " ".join(current_line)))
132-
133-
return lines
134-
135-
def get_lines(self, **kwargs) -> List[List[Tuple[Bbox, str]]]:
136-
"""Get the annotations for all lines in the document
137-
138-
Example::
139-
>>> from doctr.documents import DocumentFile
140-
>>> lines = DocumentFile.from_pdf("path/to/your/doc.pdf").get_lines()
141-
142-
Args:
143-
kwargs: keyword arguments of `fitz.Page.get_text_words`
144-
Returns:
145-
the list of pages annotations, represented as a list of tuple (bounding box, value)
146-
"""
147-
return [self.get_page_lines(idx, **kwargs) for idx in range(len(self.doc))]
148-
149-
def get_page_words(self, idx, **kwargs) -> List[Tuple[Bbox, str]]:
150-
"""Get the annotations for all words of a given page"""
151-
152-
# xmin, ymin, xmax, ymax, value, block_idx, line_idx, word_idx
153-
return [(info[:4], info[4]) for info in self.doc[idx].get_text_words(**kwargs)]
154-
155-
def get_words(self, **kwargs) -> List[List[Tuple[Bbox, str]]]:
156-
"""Get the annotations for all words in the document
157-
158-
Example::
159-
>>> from doctr.documents import DocumentFile
160-
>>> words = DocumentFile.from_pdf("path/to/your/doc.pdf").get_words()
161-
162-
Args:
163-
kwargs: keyword arguments of `fitz.Page.get_text_words`
164-
Returns:
165-
the list of pages annotations, represented as a list of tuple (bounding box, value)
166-
"""
167-
return [self.get_page_words(idx, **kwargs) for idx in range(len(self.doc))]
168-
169-
def get_page_artefacts(self, idx) -> List[Tuple[float, float, float, float]]:
170-
return [tuple(self.doc[idx].get_image_bbox(artefact)) # type: ignore[misc]
171-
for artefact in self.doc[idx].get_images(full=True)]
172-
173-
def get_artefacts(self) -> List[List[Tuple[float, float, float, float]]]:
174-
"""Get the artefacts for the entire document
175-
176-
Example::
177-
>>> from doctr.documents import DocumentFile
178-
>>> artefacts = DocumentFile.from_pdf("path/to/your/doc.pdf").get_artefacts()
179-
180-
Returns:
181-
the list of pages artefacts, represented as a list of bounding boxes
182-
"""
183-
184-
return [self.get_page_artefacts(idx) for idx in range(len(self.doc))]
38+
return [np.asarray(img) for img, _ in pdfium.render_pdf(file, scale=scale)]

doctr/io/reader.py

+6-8
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212

1313
from .html import read_html
1414
from .image import read_img_as_numpy
15-
from .pdf import PDF, read_pdf
15+
from .pdf import read_pdf
1616

1717
__all__ = ['DocumentFile']
1818

@@ -21,7 +21,7 @@ class DocumentFile:
2121
"""Read a document from multiple extensions"""
2222

2323
@classmethod
24-
def from_pdf(cls, file: AbstractFile, **kwargs) -> PDF:
24+
def from_pdf(cls, file: AbstractFile, **kwargs) -> List[np.ndarray]:
2525
"""Read a PDF file
2626
2727
Example::
@@ -31,15 +31,13 @@ def from_pdf(cls, file: AbstractFile, **kwargs) -> PDF:
3131
Args:
3232
file: the path to the PDF file or a binary stream
3333
Returns:
34-
a PDF document
34+
the list of pages decoded as numpy ndarray of shape H x W x 3
3535
"""
3636

37-
doc = read_pdf(file, **kwargs)
38-
39-
return PDF(doc)
37+
return read_pdf(file, **kwargs)
4038

4139
@classmethod
42-
def from_url(cls, url: str, **kwargs) -> PDF:
40+
def from_url(cls, url: str, **kwargs) -> List[np.ndarray]:
4341
"""Interpret a web page as a PDF document
4442
4543
Example::
@@ -49,7 +47,7 @@ def from_url(cls, url: str, **kwargs) -> PDF:
4947
Args:
5048
url: the URL of the target web page
5149
Returns:
52-
a PDF document
50+
the list of pages decoded as numpy ndarray of shape H x W x 3
5351
"""
5452
pdf_stream = read_html(url)
5553
return cls.from_pdf(pdf_stream, **kwargs)

mypy.ini

+1-1
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ ignore_missing_imports = True
1212

1313
ignore_missing_imports = True
1414

15-
[mypy-fitz.*]
15+
[mypy-pypdfium2.*]
1616

1717
ignore_missing_imports = True
1818

requirements-pt.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ numpy>=1.16.0
22
scipy>=1.4.0
33
h5py>=3.1.0
44
opencv-python>=3.4.5.20
5-
PyMuPDF>=1.16.0,!=1.18.11,!=1.18.12
5+
pypdfium2>=0.14.0
66
pyclipper>=1.2.0
77
shapely>=1.6.0
88
matplotlib>=3.1.0,<3.4.3

requirements.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ numpy>=1.16.0
22
scipy>=1.4.0
33
h5py>=3.1.0
44
opencv-python>=3.4.5.20
5-
PyMuPDF>=1.16.0,!=1.18.11,!=1.18.12
5+
pypdfium2>=0.14.0
66
pyclipper>=1.2.0
77
shapely>=1.6.0
88
matplotlib>=3.1.0,<3.4.3

scripts/analyze.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ def main(args):
2525
model = ocr_predictor(args.detection, args.recognition, pretrained=True)
2626

2727
if args.path.endswith(".pdf"):
28-
doc = DocumentFile.from_pdf(args.path).as_images()
28+
doc = DocumentFile.from_pdf(args.path)
2929
else:
3030
doc = DocumentFile.from_images(args.path)
3131

setup.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@
4545
"h5py>=3.1.0",
4646
"opencv-python>=3.4.5.20",
4747
"tensorflow>=2.4.0",
48-
"PyMuPDF>=1.16.0,!=1.18.11,!=1.18.12", # 18.11 and 18.12 fail (issue #222)
48+
"pypdfium2>=0.14.0",
4949
"pyclipper>=1.2.0",
5050
"shapely>=1.6.0",
5151
"matplotlib>=3.1.0,<3.4.3",
@@ -94,7 +94,7 @@ def deps_list(*pkgs):
9494
deps["scipy"],
9595
deps["h5py"],
9696
deps["opencv-python"],
97-
deps["PyMuPDF"],
97+
deps["pypdfium2"],
9898
deps["pyclipper"],
9999
deps["shapely"],
100100
deps["matplotlib"],

0 commit comments

Comments
 (0)