-
Notifications
You must be signed in to change notification settings - Fork 27
/
Copy pathDemoPdfFromLocalImage.java
executable file
·89 lines (68 loc) · 2.9 KB
/
DemoPdfFromLocalImage.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import com.amazon.textract.pdf.ImageType;
import com.amazon.textract.pdf.PDFDocument;
import com.amazon.textract.pdf.TextLine;
import com.amazonaws.services.textract.AmazonTextract;
import com.amazonaws.services.textract.AmazonTextractClientBuilder;
import com.amazonaws.services.textract.model.*;
import com.amazonaws.util.IOUtils;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.*;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.List;
/**
 * Demo that turns a local JPEG/PNG image into a searchable PDF.
 *
 * <p>The image is sent to Amazon Textract for text detection, and the detected
 * lines are then added, together with the image itself, as a single page of a
 * {@link PDFDocument} that is written to local disk.
 */
public class DemoPdfFromLocalImage {

    /** Value of {@code Block.getBlockType()} for blocks that are turned into text lines. */
    private static final String LINE_BLOCK_TYPE = "LINE";

    /**
     * Generates a searchable PDF from a local image file.
     *
     * @param documentName       path of the source image; treated as PNG when the name ends
     *                           with {@code .png} (case-insensitive), otherwise as JPEG
     * @param outputDocumentName path of the PDF file to write
     * @throws IOException          if the image cannot be read or decoded, or the PDF cannot be written
     * @throws NullPointerException if either argument is {@code null}
     */
    public void run(String documentName, String outputDocumentName) throws IOException {
        // Fail fast with a descriptive message instead of an anonymous NPE later on.
        if (documentName == null) {
            throw new NullPointerException("documentName must not be null");
        }
        if (outputDocumentName == null) {
            throw new NullPointerException("outputDocumentName must not be null");
        }

        System.out.println("Generating searchable pdf from: " + documentName);

        ImageType imageType = ImageType.JPEG;
        if (documentName.toLowerCase().endsWith(".png")) {
            imageType = ImageType.PNG;
        }

        // Read the file exactly once; the same bytes feed both the decoder and Textract.
        byte[] rawImage;
        try (InputStream in = new FileInputStream(documentName)) {
            rawImage = IOUtils.toByteArray(in);
        }

        // Decode locally BEFORE calling Textract so a corrupt/unsupported file
        // is rejected without making a remote call.
        BufferedImage image = decodeImage(rawImage, documentName);

        // Extract text
        List<TextLine> lines = extractText(ByteBuffer.wrap(rawImage));

        // Create new pdf document and make sure it is closed even if a step fails.
        PDFDocument pdfDocument = new PDFDocument();
        try {
            // Add page with text layer and image in the pdf document
            pdfDocument.addPage(image, imageType, lines);

            // Save PDF to local disk
            try (OutputStream outputStream = new FileOutputStream(outputDocumentName)) {
                pdfDocument.save(outputStream);
            }
        } finally {
            pdfDocument.close();
        }

        System.out.println("Generated searchable pdf: " + outputDocumentName);
    }

    /**
     * Decodes already-loaded image bytes into a {@link BufferedImage}.
     *
     * @param rawImage     encoded image bytes
     * @param documentName name of the source file, used only for the error message
     * @return the decoded image, never {@code null}
     * @throws IOException if decoding fails or no registered reader understands the data
     */
    private BufferedImage decodeImage(byte[] rawImage, String documentName) throws IOException {
        BufferedImage image;
        try (InputStream in = new ByteArrayInputStream(rawImage)) {
            image = ImageIO.read(in);
        }
        // ImageIO.read signals "no suitable reader" by returning null, not by throwing.
        if (image == null) {
            throw new IOException("Unsupported or corrupt image file: " + documentName);
        }
        return image;
    }

    /**
     * Runs Textract text detection on the image and collects every LINE block.
     *
     * @param imageBytes encoded image bytes to send to Textract
     * @return one {@link TextLine} per LINE block, built from the block's bounding box
     *         (left, top, width, height) and text; empty if no LINE blocks are returned
     */
    private List<TextLine> extractText(ByteBuffer imageBytes) {
        AmazonTextract client = AmazonTextractClientBuilder.defaultClient();
        try {
            DetectDocumentTextRequest request = new DetectDocumentTextRequest()
                    .withDocument(new Document().withBytes(imageBytes));

            DetectDocumentTextResult result = client.detectDocumentText(request);

            List<TextLine> lines = new ArrayList<>();
            List<Block> blocks = result.getBlocks();
            if (blocks == null) {
                return lines;
            }

            for (Block block : blocks) {
                // Constant-first comparison is safe when the block type is null.
                if (LINE_BLOCK_TYPE.equals(block.getBlockType())) {
                    BoundingBox boundingBox = block.getGeometry().getBoundingBox();
                    lines.add(new TextLine(boundingBox.getLeft(),
                            boundingBox.getTop(),
                            boundingBox.getWidth(),
                            boundingBox.getHeight(),
                            block.getText()));
                }
            }
            return lines;
        } finally {
            // The client is created per call, so release its resources per call too.
            client.shutdown();
        }
    }
}