44# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
55
66from pathlib import Path
7- from typing import Any , Dict , List , Optional , Tuple
7+ from typing import Any , List
88
9- import cv2
10- import fitz
119import numpy as np
10+ import pypdfium2 as pdfium
1211
13- from doctr .utils .common_types import AbstractFile , Bbox
12+ from doctr .utils .common_types import AbstractFile
1413
15- __all__ = ['read_pdf' , 'PDF' ]
14+ __all__ = ['read_pdf' ]
1615
1716
18- def read_pdf (file : AbstractFile , ** kwargs : Any ) -> fitz . Document :
17+ def read_pdf (file : AbstractFile , scale : float = 2 , ** kwargs : Any ) -> List [ np . ndarray ] :
1918 """Read a PDF file and convert its pages into a list of images in numpy format
2019
2120 Example::
@@ -24,161 +23,16 @@ def read_pdf(file: AbstractFile, **kwargs: Any) -> fitz.Document:
2423
2524 Args:
2625 file: the path to the PDF file
26+ scale: rendering scale (1 corresponds to 72dpi)
2727 Returns:
2828 the list of pages decoded as numpy ndarray of shape H x W x 3
2929 """
3030
31+ if not isinstance (file , (str , Path , bytes )):
32+ raise TypeError ("unsupported object type for argument 'file'" )
33+
3134 if isinstance (file , (str , Path )) and not Path (file ).is_file ():
3235 raise FileNotFoundError (f"unable to access { file } " )
3336
34- fitz_args : Dict [str , AbstractFile ] = {}
35-
36- if isinstance (file , (str , Path )):
37- fitz_args ['filename' ] = file
38- elif isinstance (file , bytes ):
39- fitz_args ['stream' ] = file
40- else :
41- raise TypeError ("unsupported object type for argument 'file'" )
42-
4337 # Render pages with pypdfium2 and convert them to numpy ndarrays
44- return fitz .open (** fitz_args , filetype = "pdf" , ** kwargs )
45-
46-
47- def convert_page_to_numpy (
48- page : fitz .fitz .Page ,
49- output_size : Optional [Tuple [int , int ]] = None ,
50- bgr_output : bool = False ,
51- default_scales : Tuple [float , float ] = (2 , 2 ),
52- ) -> np .ndarray :
53- """Convert a fitz page to a numpy-formatted image
54-
55- Args:
56- page: the page of a file read with PyMuPDF
57- output_size: the expected output size of each page in format H x W. Default goes to 840 x 595 for A4 pdf,
58- if you want to increase the resolution while preserving the original A4 aspect ratio can pass (1024, 726)
59- rgb_output: whether the output ndarray channel order should be RGB instead of BGR.
60- default_scales: spatial scaling to be applied when output_size is not specified where (1, 1)
61- corresponds to 72 dpi rendering.
62-
63- Returns:
64- the rendered image in numpy format
65- """
66-
67- # If no output size is specified, keep the origin one
68- if output_size is not None :
69- scales = (output_size [1 ] / page .MediaBox [2 ], output_size [0 ] / page .MediaBox [3 ])
70- else :
71- # Default 72 DPI (scales of (1, 1)) is unnecessarily low
72- scales = default_scales
73-
74- transform_matrix = fitz .Matrix (* scales )
75-
76- # Generate the pixel map using the transformation matrix
77- pixmap = page .get_pixmap (matrix = transform_matrix )
78- # Decode it into a numpy
79- img = np .frombuffer (pixmap .samples , dtype = np .uint8 ).reshape (pixmap .height , pixmap .width , 3 )
80-
81- # Switch the channel order
82- if bgr_output :
83- img = cv2 .cvtColor (img , cv2 .COLOR_RGB2BGR )
84-
85- return img
86-
87-
88- class PDF :
89- """PDF document template
90-
91- Args:
92- doc: input PDF document
93- """
94- def __init__ (self , doc : fitz .Document ) -> None :
95- self .doc = doc
96-
97- def as_images (self , ** kwargs ) -> List [np .ndarray ]:
98- """Convert all document pages to images
99-
100- Example::
101- >>> from doctr.documents import DocumentFile
102- >>> pages = DocumentFile.from_pdf("path/to/your/doc.pdf").as_images()
103-
104- Args:
105- kwargs: keyword arguments of `convert_page_to_numpy`
106- Returns:
107- the list of pages decoded as numpy ndarray of shape H x W x 3
108- """
109- return [convert_page_to_numpy (page , ** kwargs ) for page in self .doc ]
110-
111- def get_page_lines (self , idx , ** kwargs ) -> List [Tuple [Bbox , str ]]:
112- """Get the annotations for all lines of a given page"""
113- lines : List [Tuple [Bbox , str ]] = []
114- prev_block , prev_line = - 1 , - 1
115- current_line = []
116- xmin , ymin , xmax , ymax = 0 , 0 , 0 , 0
117- # xmin, ymin, xmax, ymax, value, block_idx, line_idx, word_idx
118- for info in self .doc [idx ].get_text_words (** kwargs ):
119- if prev_block == info [- 3 ] and prev_line == info [- 2 ]:
120- current_line .append (info [4 ])
121- xmin , ymin = min (xmin , info [0 ]), min (ymin , info [1 ])
122- xmax , ymax = max (xmax , info [2 ]), max (ymax , info [3 ])
123- else :
124- if len (current_line ) > 0 :
125- lines .append (((xmin , ymin , xmax , ymax ), " " .join (current_line )))
126- current_line = [info [4 ]]
127- prev_block , prev_line = info [- 3 ], info [- 2 ]
128- xmin , ymin , xmax , ymax = info [:4 ]
129-
130- if len (current_line ) > 0 :
131- lines .append (((xmin , ymin , xmax , ymax ), " " .join (current_line )))
132-
133- return lines
134-
135- def get_lines (self , ** kwargs ) -> List [List [Tuple [Bbox , str ]]]:
136- """Get the annotations for all lines in the document
137-
138- Example::
139- >>> from doctr.documents import DocumentFile
140- >>> lines = DocumentFile.from_pdf("path/to/your/doc.pdf").get_lines()
141-
142- Args:
143- kwargs: keyword arguments of `fitz.Page.get_text_words`
144- Returns:
145- the list of pages annotations, represented as a list of tuple (bounding box, value)
146- """
147- return [self .get_page_lines (idx , ** kwargs ) for idx in range (len (self .doc ))]
148-
149- def get_page_words (self , idx , ** kwargs ) -> List [Tuple [Bbox , str ]]:
150- """Get the annotations for all words of a given page"""
151-
152- # xmin, ymin, xmax, ymax, value, block_idx, line_idx, word_idx
153- return [(info [:4 ], info [4 ]) for info in self .doc [idx ].get_text_words (** kwargs )]
154-
155- def get_words (self , ** kwargs ) -> List [List [Tuple [Bbox , str ]]]:
156- """Get the annotations for all words in the document
157-
158- Example::
159- >>> from doctr.documents import DocumentFile
160- >>> words = DocumentFile.from_pdf("path/to/your/doc.pdf").get_words()
161-
162- Args:
163- kwargs: keyword arguments of `fitz.Page.get_text_words`
164- Returns:
165- the list of pages annotations, represented as a list of tuple (bounding box, value)
166- """
167- return [self .get_page_words (idx , ** kwargs ) for idx in range (len (self .doc ))]
168-
169- def get_page_artefacts (self , idx ) -> List [Tuple [float , float , float , float ]]:
170- return [tuple (self .doc [idx ].get_image_bbox (artefact )) # type: ignore[misc]
171- for artefact in self .doc [idx ].get_images (full = True )]
172-
173- def get_artefacts (self ) -> List [List [Tuple [float , float , float , float ]]]:
174- """Get the artefacts for the entire document
175-
176- Example::
177- >>> from doctr.documents import DocumentFile
178- >>> artefacts = DocumentFile.from_pdf("path/to/your/doc.pdf").get_artefacts()
179-
180- Returns:
181- the list of pages artefacts, represented as a list of bounding boxes
182- """
183-
184- return [self .get_page_artefacts (idx ) for idx in range (len (self .doc ))]
38+ return [np .asarray (img ) for img , _ in pdfium .render_pdf (file , scale = scale )]
0 commit comments