4
4
# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
5
5
6
6
from pathlib import Path
7
- from typing import Any , Dict , List , Optional , Tuple
7
+ from typing import Any , List
8
8
9
- import cv2
10
- import fitz
11
9
import numpy as np
10
+ import pypdfium2 as pdfium
12
11
13
- from doctr .utils .common_types import AbstractFile , Bbox
12
+ from doctr .utils .common_types import AbstractFile
14
13
15
- __all__ = ['read_pdf' , 'PDF' ]
14
+ __all__ = ['read_pdf' ]
16
15
17
16
18
- def read_pdf (file : AbstractFile , ** kwargs : Any ) -> fitz . Document :
17
+ def read_pdf (file : AbstractFile , scale : float = 2 , ** kwargs : Any ) -> List [ np . ndarray ] :
19
18
"""Read a PDF file and convert it into an image in numpy format
20
19
21
20
Example::
@@ -24,161 +23,16 @@ def read_pdf(file: AbstractFile, **kwargs: Any) -> fitz.Document:
24
23
25
24
Args:
26
25
file: the path to the PDF file
26
+ scale: rendering scale (1 corresponds to 72dpi)
27
27
Returns:
28
28
the list of pages decoded as numpy ndarray of shape H x W x 3
29
29
"""
30
30
31
+ if not isinstance (file , (str , Path , bytes )):
32
+ raise TypeError ("unsupported object type for argument 'file'" )
33
+
31
34
if isinstance (file , (str , Path )) and not Path (file ).is_file ():
32
35
raise FileNotFoundError (f"unable to access { file } " )
33
36
34
- fitz_args : Dict [str , AbstractFile ] = {}
35
-
36
- if isinstance (file , (str , Path )):
37
- fitz_args ['filename' ] = file
38
- elif isinstance (file , bytes ):
39
- fitz_args ['stream' ] = file
40
- else :
41
- raise TypeError ("unsupported object type for argument 'file'" )
42
-
43
37
# Render pages with pypdfium2 and convert them to numpy ndarrays
44
- return fitz .open (** fitz_args , filetype = "pdf" , ** kwargs )
45
-
46
-
47
- def convert_page_to_numpy (
48
- page : fitz .fitz .Page ,
49
- output_size : Optional [Tuple [int , int ]] = None ,
50
- bgr_output : bool = False ,
51
- default_scales : Tuple [float , float ] = (2 , 2 ),
52
- ) -> np .ndarray :
53
- """Convert a fitz page to a numpy-formatted image
54
-
55
- Args:
56
- page: the page of a file read with PyMuPDF
57
- output_size: the expected output size of each page in format H x W. Default goes to 840 x 595 for A4 pdf,
58
- if you want to increase the resolution while preserving the original A4 aspect ratio, you can pass (1024, 726)
59
- bgr_output: whether the output ndarray channel order should be BGR instead of RGB.
60
- default_scales: spatial scaling to be applied when output_size is not specified where (1, 1)
61
- corresponds to 72 dpi rendering.
62
-
63
- Returns:
64
- the rendered image in numpy format
65
- """
66
-
67
- # If no output size is specified, keep the original page size (rendered with the default scales)
68
- if output_size is not None :
69
- scales = (output_size [1 ] / page .MediaBox [2 ], output_size [0 ] / page .MediaBox [3 ])
70
- else :
71
- # Default 72 DPI (scales of (1, 1)) is unnecessarily low
72
- scales = default_scales
73
-
74
- transform_matrix = fitz .Matrix (* scales )
75
-
76
- # Generate the pixel map using the transformation matrix
77
- pixmap = page .get_pixmap (matrix = transform_matrix )
78
- # Decode it into a numpy
79
- img = np .frombuffer (pixmap .samples , dtype = np .uint8 ).reshape (pixmap .height , pixmap .width , 3 )
80
-
81
- # Switch the channel order
82
- if bgr_output :
83
- img = cv2 .cvtColor (img , cv2 .COLOR_RGB2BGR )
84
-
85
- return img
86
-
87
-
88
- class PDF :
89
- """PDF document template
90
-
91
- Args:
92
- doc: input PDF document
93
- """
94
- def __init__ (self , doc : fitz .Document ) -> None :
95
- self .doc = doc
96
-
97
- def as_images (self , ** kwargs ) -> List [np .ndarray ]:
98
- """Convert all document pages to images
99
-
100
- Example::
101
- >>> from doctr.documents import DocumentFile
102
- >>> pages = DocumentFile.from_pdf("path/to/your/doc.pdf").as_images()
103
-
104
- Args:
105
- kwargs: keyword arguments of `convert_page_to_numpy`
106
- Returns:
107
- the list of pages decoded as numpy ndarray of shape H x W x 3
108
- """
109
- return [convert_page_to_numpy (page , ** kwargs ) for page in self .doc ]
110
-
111
- def get_page_lines (self , idx , ** kwargs ) -> List [Tuple [Bbox , str ]]:
112
- """Get the annotations for all lines of a given page"""
113
- lines : List [Tuple [Bbox , str ]] = []
114
- prev_block , prev_line = - 1 , - 1
115
- current_line = []
116
- xmin , ymin , xmax , ymax = 0 , 0 , 0 , 0
117
- # xmin, ymin, xmax, ymax, value, block_idx, line_idx, word_idx
118
- for info in self .doc [idx ].get_text_words (** kwargs ):
119
- if prev_block == info [- 3 ] and prev_line == info [- 2 ]:
120
- current_line .append (info [4 ])
121
- xmin , ymin = min (xmin , info [0 ]), min (ymin , info [1 ])
122
- xmax , ymax = max (xmax , info [2 ]), max (ymax , info [3 ])
123
- else :
124
- if len (current_line ) > 0 :
125
- lines .append (((xmin , ymin , xmax , ymax ), " " .join (current_line )))
126
- current_line = [info [4 ]]
127
- prev_block , prev_line = info [- 3 ], info [- 2 ]
128
- xmin , ymin , xmax , ymax = info [:4 ]
129
-
130
- if len (current_line ) > 0 :
131
- lines .append (((xmin , ymin , xmax , ymax ), " " .join (current_line )))
132
-
133
- return lines
134
-
135
- def get_lines (self , ** kwargs ) -> List [List [Tuple [Bbox , str ]]]:
136
- """Get the annotations for all lines in the document
137
-
138
- Example::
139
- >>> from doctr.documents import DocumentFile
140
- >>> lines = DocumentFile.from_pdf("path/to/your/doc.pdf").get_lines()
141
-
142
- Args:
143
- kwargs: keyword arguments of `fitz.Page.get_text_words`
144
- Returns:
145
- the list of pages annotations, represented as a list of tuple (bounding box, value)
146
- """
147
- return [self .get_page_lines (idx , ** kwargs ) for idx in range (len (self .doc ))]
148
-
149
- def get_page_words (self , idx , ** kwargs ) -> List [Tuple [Bbox , str ]]:
150
- """Get the annotations for all words of a given page"""
151
-
152
- # xmin, ymin, xmax, ymax, value, block_idx, line_idx, word_idx
153
- return [(info [:4 ], info [4 ]) for info in self .doc [idx ].get_text_words (** kwargs )]
154
-
155
- def get_words (self , ** kwargs ) -> List [List [Tuple [Bbox , str ]]]:
156
- """Get the annotations for all words in the document
157
-
158
- Example::
159
- >>> from doctr.documents import DocumentFile
160
- >>> words = DocumentFile.from_pdf("path/to/your/doc.pdf").get_words()
161
-
162
- Args:
163
- kwargs: keyword arguments of `fitz.Page.get_text_words`
164
- Returns:
165
- the list of pages annotations, represented as a list of tuple (bounding box, value)
166
- """
167
- return [self .get_page_words (idx , ** kwargs ) for idx in range (len (self .doc ))]
168
-
169
- def get_page_artefacts (self , idx ) -> List [Tuple [float , float , float , float ]]:
170
- return [tuple (self .doc [idx ].get_image_bbox (artefact )) # type: ignore[misc]
171
- for artefact in self .doc [idx ].get_images (full = True )]
172
-
173
- def get_artefacts (self ) -> List [List [Tuple [float , float , float , float ]]]:
174
- """Get the artefacts for the entire document
175
-
176
- Example::
177
- >>> from doctr.documents import DocumentFile
178
- >>> artefacts = DocumentFile.from_pdf("path/to/your/doc.pdf").get_artefacts()
179
-
180
- Returns:
181
- the list of pages artefacts, represented as a list of bounding boxes
182
- """
183
-
184
- return [self .get_page_artefacts (idx ) for idx in range (len (self .doc ))]
38
+ return [np .asarray (img ) for img , _ in pdfium .render_pdf (file , scale = scale )]
0 commit comments