-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathextract_pdf_notes.py
81 lines (61 loc) · 3.1 KB
/
extract_pdf_notes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
#!/usr/bin/env python3
import popplerqt5
import sys
import PyQt5
# Resolution handed to page.renderToImage() for both axes when the area of a
# geometric annotation is exported as a PNG image (presumably DPI; page sizes
# are rescaled from a 72-based size by resolution/72 before cropping).
resolution = 150
def _highlighted_text(page, annotation, pwidth, pheight):
    """Return the page text covered by a highlight annotation.

    For every quad of the highlight, the coordinates of ``points[0]`` and
    ``points[2]`` are multiplied by the page size and used as the two
    corners of a rectangle whose text is read from *page*. Each piece of
    text is followed by a single space, which keeps the output format of
    the previous implementation.
    """
    pieces = []
    for quad in annotation.highlightQuads():
        area = PyQt5.QtCore.QRectF()
        area.setCoords(
            quad.points[0].x() * pwidth,
            quad.points[0].y() * pheight,
            quad.points[2].x() * pwidth,
            quad.points[2].y() * pheight,
        )
        pieces.append(str(page.text(area)) + ' ')
    return "".join(pieces)


def _export_geom_image(page, annotation, pwidth, pheight, path):
    """Render the area of a geometric annotation into the image file *path*.

    Returns the result of ``QImage.save()`` so the caller can tell whether
    the file was actually written.
    """
    # The page size is rescaled by resolution/72 so that the annotation
    # boundary can be expressed in pixels of the rendered image.
    width = pwidth * resolution / 72
    height = pheight * resolution / 72
    bounds = annotation.boundary()
    # renderToImage() takes integer x/y/w/h; recent Python versions no longer
    # convert floats implicitly, so truncate explicitly.
    image = page.renderToImage(
        resolution,
        resolution,
        int(bounds.left() * width),
        int(bounds.top() * height),
        int(bounds.width() * width),
        int(bounds.height() * height),
    )
    return image.save(path)


def main():
    """Print the annotations of the PDF given as first command-line argument.

    Output, per annotation:
      * highlight annotations: the highlighted text, then the note (if any);
      * geometric annotations: the annotated area is saved as
        ``<pdf>_page<i>_image<n>.png`` (``i`` is the 0-based page index,
        ``n`` counts geometric annotations on that page starting at 1) and
        the file name is printed, then the note (if any);
      * text annotations: the note (if any).

    Prints "no annotations found" when the document has none. Exits with
    status 2 when no file argument is given and with status 1 when the
    document cannot be loaded.
    """
    if len(sys.argv) < 2:
        print("Usage: {} <file.pdf>".format(sys.argv[0]), file=sys.stderr)
        sys.exit(2)
    pdf_path = sys.argv[1]

    doc = popplerqt5.Poppler.Document.load(pdf_path)
    if not doc:
        print("Failed to load the file. Check if the provided document is valid PDF - \t[{}]".format(pdf_path))
        sys.exit(1)
    print(pdf_path)

    total_annotations = 0
    for i in range(doc.numPages()):
        page = doc.page(i)
        page_size = page.pageSize()
        pwidth, pheight = page_size.width(), page_size.height()
        count = 0  # geometric annotations seen on this page (used in file names)

        for annotation in page.annotations():
            if not isinstance(annotation, popplerqt5.Poppler.Annotation):
                continue
            total_annotations += 1

            if isinstance(annotation, popplerqt5.Poppler.HighlightAnnotation):
                print(_highlighted_text(page, annotation, pwidth, pheight))
                if annotation.contents():
                    print("\t - {}".format(annotation.contents()))

            if isinstance(annotation, popplerqt5.Poppler.GeomAnnotation):
                count += 1
                image_path = "{}_page{}_image{}.png".format(pdf_path, i, count)
                if _export_geom_image(page, annotation, pwidth, pheight, image_path):
                    print(image_path)
                else:
                    # Do not advertise a file that was never written.
                    print("Failed to save image {}".format(image_path), file=sys.stderr)
                if annotation.contents():
                    print("\t - {}".format(annotation.contents()))

            if isinstance(annotation, popplerqt5.Poppler.TextAnnotation):
                if annotation.contents():
                    print("\t - {}".format(annotation.contents()))

    if total_annotations == 0:
        print("no annotations found")
# Run the extraction only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()