refactor: Switched from PyMuPDF to pypdfium2 (mindee#829)

fg-mindee · web-flow · commit 2581daadad41 · 2022-02-24T17:26:59.000+01:00
* chore: Updated PDF lib

* refactor: Refactored PDF parsing

* test: Updated unittests

* docs: Updated instructions

* refactor: Switched to another PDF backend

* docs: Updated documentation

* fix: Fixed demo

* refactor: Removed legacy imports

* style: Updated mypy config

* fix: Fixed read_pdf

* chore: Updated deps

* test: Fixed unittests

* fix: Fixed analysis script

* chore: Fixed requirements

* test: Removed PyMuPDF from unittests

* chore: Removed PyMuPDF

* test: Fixed unittest

* fix: Fixed Dockerfile
diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
@@ -12,9 +12,9 @@ jobs:
     steps:
       - uses: actions/checkout@v2
       - name: Build docker image
-        run: docker build . -t doctr-py3.8.1-tf2.4-slim
+        run: docker build . -t doctr-tf-py3.8-slim
       - name: Run docker container
-        run: docker run doctr-py3.8.1-tf2.4-slim python -c 'import doctr'
+        run: docker run doctr-tf-py3.8-slim python -c 'import doctr'
 
   pytest-api:
     runs-on: ubuntu-latest
diff --git a/Dockerfile b/Dockerfile
@@ -1,4 +1,4 @@
-FROM python:3.8.1-slim
+FROM python:3.8-slim
 
 ENV PYTHONUNBUFFERED 1
 ENV PYTHONDONTWRITEBYTECODE 1
diff --git a/README.md b/README.md
@@ -34,11 +34,11 @@ Documents can be interpreted from PDF or images:
 ```python
 from doctr.io import DocumentFile
 # PDF
-pdf_doc = DocumentFile.from_pdf("path/to/your/doc.pdf").as_images()
+pdf_doc = DocumentFile.from_pdf("path/to/your/doc.pdf")
 # Image
 single_img_doc = DocumentFile.from_images("path/to/your/img.jpg")
 # Webpage
-webpage_doc = DocumentFile.from_url("https://www.yoursite.com").as_images()
+webpage_doc = DocumentFile.from_url("https://www.yoursite.com")
 # Multiple page images
 multi_img_doc = DocumentFile.from_images(["path/to/page1.jpg", "path/to/page2.jpg"])
 ```
@@ -51,7 +51,7 @@ from doctr.models import ocr_predictor
 
 model = ocr_predictor(pretrained=True)
 # PDF
-doc = DocumentFile.from_pdf("path/to/your/doc.pdf").as_images()
+doc = DocumentFile.from_pdf("path/to/your/doc.pdf")
 # Analyze
 result = model(doc)
 ```
diff --git a/demo/app.py b/demo/app.py
@@ -52,7 +52,7 @@ def main():
     uploaded_file = st.sidebar.file_uploader("Upload files", type=['pdf', 'png', 'jpeg', 'jpg'])
     if uploaded_file is not None:
         if uploaded_file.name.endswith('.pdf'):
-            doc = DocumentFile.from_pdf(uploaded_file.read()).as_images()
+            doc = DocumentFile.from_pdf(uploaded_file.read())
         else:
             doc = DocumentFile.from_images(uploaded_file.read())
         page_idx = st.sidebar.selectbox("Page selection", [idx + 1 for idx in range(len(doc))]) - 1
diff --git a/docs/source/io.rst b/docs/source/io.rst
@@ -82,13 +82,3 @@ High-performance file reading and conversion to processable structured data.
    .. automethod:: from_url
 
    .. automethod:: from_images
-
-.. autoclass:: PDF
-
-   .. automethod:: as_images
-
-   .. automethod:: get_words
-
-   .. automethod:: get_lines
-
-   .. automethod:: get_artefacts
diff --git a/doctr/io/pdf.py b/doctr/io/pdf.py
@@ -4,18 +4,17 @@
 # See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
 
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, List
 
-import cv2
-import fitz
 import numpy as np
+import pypdfium2 as pdfium
 
-from doctr.utils.common_types import AbstractFile, Bbox
+from doctr.utils.common_types import AbstractFile
 
-__all__ = ['read_pdf', 'PDF']
+__all__ = ['read_pdf']
 
 
-def read_pdf(file: AbstractFile, **kwargs: Any) -> fitz.Document:
+def read_pdf(file: AbstractFile, scale: float = 2, **kwargs: Any) -> List[np.ndarray]:
     """Read a PDF file and convert it into an image in numpy format
 
     Example::
@@ -24,161 +23,16 @@ def read_pdf(file: AbstractFile, **kwargs: Any) -> fitz.Document:
 
     Args:
         file: the path to the PDF file
+        scale: rendering scale (1 corresponds to 72dpi)
     Returns:
         the list of pages decoded as numpy ndarray of shape H x W x 3
     """
 
+    if not isinstance(file, (str, Path, bytes)):
+        raise TypeError("unsupported object type for argument 'file'")
+
     if isinstance(file, (str, Path)) and not Path(file).is_file():
         raise FileNotFoundError(f"unable to access {file}")
 
-    fitz_args: Dict[str, AbstractFile] = {}
-
-    if isinstance(file, (str, Path)):
-        fitz_args['filename'] = file
-    elif isinstance(file, bytes):
-        fitz_args['stream'] = file
-    else:
-        raise TypeError("unsupported object type for argument 'file'")
-
     # Read pages with fitz and convert them to numpy ndarrays
-    return fitz.open(**fitz_args, filetype="pdf", **kwargs)
-
-
-def convert_page_to_numpy(
-    page: fitz.fitz.Page,
-    output_size: Optional[Tuple[int, int]] = None,
-    bgr_output: bool = False,
-    default_scales: Tuple[float, float] = (2, 2),
-) -> np.ndarray:
-    """Convert a fitz page to a numpy-formatted image
-
-    Args:
-        page: the page of a file read with PyMuPDF
-        output_size: the expected output size of each page in format H x W. Default goes to 840 x 595 for A4 pdf,
-        if you want to increase the resolution while preserving the original A4 aspect ratio can pass (1024, 726)
-        rgb_output: whether the output ndarray channel order should be RGB instead of BGR.
-        default_scales: spatial scaling to be applied when output_size is not specified where (1, 1)
-            corresponds to 72 dpi rendering.
-
-    Returns:
-        the rendered image in numpy format
-    """
-
-    # If no output size is specified, keep the origin one
-    if output_size is not None:
-        scales = (output_size[1] / page.MediaBox[2], output_size[0] / page.MediaBox[3])
-    else:
-        # Default 72 DPI (scales of (1, 1)) is unnecessarily low
-        scales = default_scales
-
-    transform_matrix = fitz.Matrix(*scales)
-
-    # Generate the pixel map using the transformation matrix
-    pixmap = page.get_pixmap(matrix=transform_matrix)
-    # Decode it into a numpy
-    img = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.height, pixmap.width, 3)
-
-    # Switch the channel order
-    if bgr_output:
-        img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
-
-    return img
-
-
-class PDF:
-    """PDF document template
-
-    Args:
-        doc: input PDF document
-    """
-    def __init__(self, doc: fitz.Document) -> None:
-        self.doc = doc
-
-    def as_images(self, **kwargs) -> List[np.ndarray]:
-        """Convert all document pages to images
-
-        Example::
-            >>> from doctr.documents import DocumentFile
-            >>> pages = DocumentFile.from_pdf("path/to/your/doc.pdf").as_images()
-
-        Args:
-            kwargs: keyword arguments of `convert_page_to_numpy`
-        Returns:
-            the list of pages decoded as numpy ndarray of shape H x W x 3
-        """
-        return [convert_page_to_numpy(page, **kwargs) for page in self.doc]
-
-    def get_page_lines(self, idx, **kwargs) -> List[Tuple[Bbox, str]]:
-        """Get the annotations for all lines of a given page"""
-        lines: List[Tuple[Bbox, str]] = []
-        prev_block, prev_line = -1, -1
-        current_line = []
-        xmin, ymin, xmax, ymax = 0, 0, 0, 0
-        # xmin, ymin, xmax, ymax, value, block_idx, line_idx, word_idx
-        for info in self.doc[idx].get_text_words(**kwargs):
-            if prev_block == info[-3] and prev_line == info[-2]:
-                current_line.append(info[4])
-                xmin, ymin = min(xmin, info[0]), min(ymin, info[1])
-                xmax, ymax = max(xmax, info[2]), max(ymax, info[3])
-            else:
-                if len(current_line) > 0:
-                    lines.append(((xmin, ymin, xmax, ymax), " ".join(current_line)))
-                current_line = [info[4]]
-                prev_block, prev_line = info[-3], info[-2]
-                xmin, ymin, xmax, ymax = info[:4]
-
-        if len(current_line) > 0:
-            lines.append(((xmin, ymin, xmax, ymax), " ".join(current_line)))
-
-        return lines
-
-    def get_lines(self, **kwargs) -> List[List[Tuple[Bbox, str]]]:
-        """Get the annotations for all lines in the document
-
-        Example::
-            >>> from doctr.documents import DocumentFile
-            >>> lines = DocumentFile.from_pdf("path/to/your/doc.pdf").get_lines()
-
-        Args:
-            kwargs: keyword arguments of `fitz.Page.get_text_words`
-        Returns:
-            the list of pages annotations, represented as a list of tuple (bounding box, value)
-        """
-        return [self.get_page_lines(idx, **kwargs) for idx in range(len(self.doc))]
-
-    def get_page_words(self, idx, **kwargs) -> List[Tuple[Bbox, str]]:
-        """Get the annotations for all words of a given page"""
-
-        # xmin, ymin, xmax, ymax, value, block_idx, line_idx, word_idx
-        return [(info[:4], info[4]) for info in self.doc[idx].get_text_words(**kwargs)]
-
-    def get_words(self, **kwargs) -> List[List[Tuple[Bbox, str]]]:
-        """Get the annotations for all words in the document
-
-        Example::
-            >>> from doctr.documents import DocumentFile
-            >>> words = DocumentFile.from_pdf("path/to/your/doc.pdf").get_words()
-
-        Args:
-            kwargs: keyword arguments of `fitz.Page.get_text_words`
-        Returns:
-            the list of pages annotations, represented as a list of tuple (bounding box, value)
-        """
-        return [self.get_page_words(idx, **kwargs) for idx in range(len(self.doc))]
-
-    def get_page_artefacts(self, idx) -> List[Tuple[float, float, float, float]]:
-        return [tuple(self.doc[idx].get_image_bbox(artefact))  # type: ignore[misc]
-                for artefact in self.doc[idx].get_images(full=True)]
-
-    def get_artefacts(self) -> List[List[Tuple[float, float, float, float]]]:
-        """Get the artefacts for the entire document
-
-        Example::
-            >>> from doctr.documents import DocumentFile
-            >>> artefacts = DocumentFile.from_pdf("path/to/your/doc.pdf").get_artefacts()
-
-        Returns:
-            the list of pages artefacts, represented as a list of bounding boxes
-        """
-
-        return [self.get_page_artefacts(idx) for idx in range(len(self.doc))]
+    return [np.asarray(img) for img, _ in pdfium.render_pdf(file, scale=scale)]
diff --git a/doctr/io/reader.py b/doctr/io/reader.py
@@ -12,7 +12,7 @@
 
 from .html import read_html
 from .image import read_img_as_numpy
-from .pdf import PDF, read_pdf
+from .pdf import read_pdf
 
 __all__ = ['DocumentFile']
 
@@ -21,7 +21,7 @@ class DocumentFile:
     """Read a document from multiple extensions"""
 
     @classmethod
-    def from_pdf(cls, file: AbstractFile, **kwargs) -> PDF:
+    def from_pdf(cls, file: AbstractFile, **kwargs) -> List[np.ndarray]:
         """Read a PDF file
 
         Example::
@@ -31,15 +31,13 @@ def from_pdf(cls, file: AbstractFile, **kwargs) -> PDF:
         Args:
             file: the path to the PDF file or a binary stream
         Returns:
-            a PDF document
+            the list of pages decoded as numpy ndarray of shape H x W x 3
         """
 
-        doc = read_pdf(file, **kwargs)
-
-        return PDF(doc)
+        return read_pdf(file, **kwargs)
 
     @classmethod
-    def from_url(cls, url: str, **kwargs) -> PDF:
+    def from_url(cls, url: str, **kwargs) -> List[np.ndarray]:
         """Interpret a web page as a PDF document
 
         Example::
@@ -49,7 +47,7 @@ def from_url(cls, url: str, **kwargs) -> PDF:
         Args:
             url: the URL of the target web page
         Returns:
-            a PDF document
+            the list of pages decoded as numpy ndarray of shape H x W x 3
         """
         pdf_stream = read_html(url)
         return cls.from_pdf(pdf_stream, **kwargs)
diff --git a/mypy.ini b/mypy.ini
@@ -12,7 +12,7 @@ ignore_missing_imports = True
 
 ignore_missing_imports = True
 
-[mypy-fitz.*]
+[mypy-pypdfium2.*]
 
 ignore_missing_imports = True
 
diff --git a/requirements-pt.txt b/requirements-pt.txt
@@ -2,7 +2,7 @@ numpy>=1.16.0
 scipy>=1.4.0
 h5py>=3.1.0
 opencv-python>=3.4.5.20
-PyMuPDF>=1.16.0,!=1.18.11,!=1.18.12
+pypdfium2>=0.14.0
 pyclipper>=1.2.0
 shapely>=1.6.0
 matplotlib>=3.1.0,<3.4.3
diff --git a/requirements.txt b/requirements.txt
@@ -2,7 +2,7 @@ numpy>=1.16.0
 scipy>=1.4.0
 h5py>=3.1.0
 opencv-python>=3.4.5.20
-PyMuPDF>=1.16.0,!=1.18.11,!=1.18.12
+pypdfium2>=0.14.0
 pyclipper>=1.2.0
 shapely>=1.6.0
 matplotlib>=3.1.0,<3.4.3
diff --git a/scripts/analyze.py b/scripts/analyze.py
@@ -25,7 +25,7 @@ def main(args):
     model = ocr_predictor(args.detection, args.recognition, pretrained=True)
 
     if args.path.endswith(".pdf"):
-        doc = DocumentFile.from_pdf(args.path).as_images()
+        doc = DocumentFile.from_pdf(args.path)
     else:
         doc = DocumentFile.from_images(args.path)
 
diff --git a/setup.py b/setup.py
@@ -45,7 +45,7 @@
     "h5py>=3.1.0",
     "opencv-python>=3.4.5.20",
     "tensorflow>=2.4.0",
-    "PyMuPDF>=1.16.0,!=1.18.11,!=1.18.12",  # 18.11 and 18.12 fail (issue #222)
+    "pypdfium2>=0.14.0",
     "pyclipper>=1.2.0",
     "shapely>=1.6.0",
     "matplotlib>=3.1.0,<3.4.3",
@@ -94,7 +94,7 @@ def deps_list(*pkgs):
     deps["scipy"],
     deps["h5py"],
     deps["opencv-python"],
-    deps["PyMuPDF"],
+    deps["pypdfium2"],
     deps["pyclipper"],
     deps["shapely"],
     deps["matplotlib"],
diff --git a/tests/common/test_io.py b/tests/common/test_io.py
diff --git a/tests/common/test_models.py b/tests/common/test_models.py
diff --git a/tests/conftest.py b/tests/conftest.py
diff --git a/tests/pytorch/test_models_zoo_pt.py b/tests/pytorch/test_models_zoo_pt.py
diff --git a/tests/tensorflow/test_models_zoo_tf.py b/tests/tensorflow/test_models_zoo_tf.py

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -1,4 +1,4 @@` |
| `1` | | `-FROM python:3.8.1-slim` |
| | `1` | `+FROM python:3.8-slim` |
| `2` | `2` | |
| `3` | `3` | `ENV PYTHONUNBUFFERED 1` |
| `4` | `4` | `ENV PYTHONDONTWRITEBYTECODE 1` |