Skip to content

Commit 5c6fa9f

Browse files
committed
First pass on OcrPostProcessor
The miniOCR code I tried to adapt is not working yet. This needs another processor in the chain that invokes this one as a child, once per page. The idea is that each processor is atomic, so this one only deals with a single page. Too tired to keep coding. Let's see if I can figure out what is missing from the XML transformation; probably something stupid on my side. I'm also now having trouble reading the value back: it's a string, but on decode PHP thinks it's an object... gosh
1 parent e144f71 commit 5c6fa9f

File tree

2 files changed

+352
-1
lines changed

2 files changed

+352
-1
lines changed

src/Plugin/QueueWorker/IndexPostProcessorQueueWorker.php

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,7 @@ public function processItem($data) {
193193
error_log(empty($processed_data));
194194
//@TODO allow a force in case of corrupted key value? Partial output
195195
// Extragenous weird data?
196-
if (empty($processed_data) ||
196+
if (true || empty($processed_data) ||
197197
$data->force == TRUE ||
198198
(!isset($processed_data->checksum) ||
199199
empty($processed_data->checksum) ||
@@ -219,6 +219,7 @@ public function processItem($data) {
219219
$toindex = new \stdClass();
220220
$toindex->fulltext = $io->output;
221221
$toindex->checksum = $data->metadata['checksum'];
222+
error_log(var_export($toindex,true));
222223
$this->keyValue->get($keyvalue_collection)->set($key, $toindex);
223224

224225
// Get which indexes have our StrawberryfieldFlavorDatasource enabled!
Lines changed: 350 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,350 @@
1+
<?php
2+
/**
3+
* Created by PhpStorm.
4+
* User: dpino
5+
* Date: 11/11/19
6+
* Time: 8:18 PM
7+
*/
8+
9+
namespace Drupal\strawberry_runners\Plugin\StrawberryRunnersPostProcessor;
10+
11+
use Drupal\Core\Form\FormStateInterface;
12+
use Drupal\strawberry_runners\Plugin\StrawberryRunnersPostProcessor\SystemBinaryPostProcessor;
13+
use Drupal\strawberry_runners\Annotation\StrawberryRunnersPostProcessor;
14+
use Drupal\strawberry_runners\Plugin\StrawberryRunnersPostProcessorPluginBase;
15+
use Drupal\strawberry_runners\Plugin\StrawberryRunnersPostProcessorPluginInterface;
16+
17+
18+
/**
19+
*
20+
* System Binary Post processor Plugin Implementation
21+
*
22+
* @StrawberryRunnersPostProcessor(
23+
* id = "ocr",
24+
* label = @Translation("Post processor that Runs OCR/HORC against files"),
25+
* input_type = "entity:file",
26+
* input_property = "filepath",
27+
* input_argument = "page_number"
28+
* )
29+
*/
30+
class OcrPostProcessor extends SystemBinaryPostProcessor {
31+
32+
/**
 * {@inheritdoc}
 */
public function defaultConfiguration() {
  // Defaults specific to the OCR processor: a Ghostscript binary ("path",
  // "arguments") plus a Tesseract binary ("path_tesseract",
  // "arguments_tesseract").
  $ocr_defaults = [
    'source_type' => 'asstructure',
    'mime_type' => ['application/pdf'],
    'path' => '',
    'path_tesseract' => '',
    'arguments' => '',
    'arguments_tesseract' => '',
    'output_type' => 'json',
    'output_destination' => 'subkey',
  ];
  // Array union: keys defined above win, the parent only fills in the rest.
  $inherited_defaults = parent::defaultConfiguration();
  return $ocr_defaults + $inherited_defaults;
}
47+
48+
49+
/**
 * Reports the configuration dependencies of this plugin instance.
 *
 * Processors can be chained, so eventually this should report any other
 * processor instance that uses an instance of this one.
 *
 * @return array
 *   Currently always an empty array. Returning an array (instead of falling
 *   off the end of the method and yielding NULL) keeps callers that merge or
 *   iterate the result working.
 */
public function calculateDependencies() {
  // @TODO: Report chained processor instances once chaining is implemented.
  return [];
}
54+
55+
/**
 * Builds the configuration form elements for the OCR post processor.
 *
 * @param array $parents
 *   Form parents of the element being built. Not read by this implementation.
 * @param \Drupal\Core\Form\FormStateInterface $form_state
 *   The current form state. Not read by this implementation.
 *
 * @return array
 *   Form API elements keyed by the configuration key they edit.
 */
public function settingsForm(array $parents, FormStateInterface $form_state) {
  $config = $this->getConfiguration();
  // Reads a stored setting without raising "undefined index" notices for keys
  // this class does not define a default for (e.g. ado_type, timeout, weight).
  $setting = function ($key, $fallback = NULL) use ($config) {
    return isset($config[$key]) ? $config[$key] : $fallback;
  };
  // Checkboxes need an array of selected options as default value.
  $selection = function ($key) use ($config) {
    return (!empty($config[$key]) && is_array($config[$key])) ? $config[$key] : [];
  };
  // The default configuration stores mime types as an array, but a textfield
  // can only display a string, so flatten arrays to a comma separated list.
  $mime_type = $setting('mime_type');
  if (is_array($mime_type)) {
    $mime_type = implode(',', $mime_type);
  }

  $element = [];

  $element['source_type'] = [
    '#type' => 'select',
    '#title' => $this->t('The type of source data this processor works on'),
    '#options' => [
      'asstructure' => 'File entities referenced in the as:filetype JSON structure',
      'filepath' => 'Full file paths passed by another processor',
    ],
    '#default_value' => $setting('source_type'),
    '#description' => $this->t('Select from where the source file this processor needs is fetched'),
    '#required' => TRUE,
  ];

  $element['ado_type'] = [
    '#type' => 'textfield',
    '#title' => $this->t('ADO type(s) to limit this processor to.'),
    '#default_value' => $setting('ado_type'),
    '#description' => $this->t('A single ADO type or a coma delimited list of ado types that qualify to be Processed. Leave empty to apply to all ADOs.'),
  ];

  $element['jsonkey'] = [
    '#type' => 'checkboxes',
    '#title' => $this->t('The JSON key that contains the desired source files.'),
    '#options' => [
      'as:image' => 'as:image',
      'as:document' => 'as:document',
      'as:audio' => 'as:audio',
      'as:video' => 'as:video',
      'as:text' => 'as:text',
      'as:application' => 'as:application',
    ],
    '#default_value' => $selection('jsonkey'),
    // Only shown while the source type select is set to "asstructure".
    '#states' => [
      'visible' => [
        ':input[name="pluginconfig[source_type]"]' => ['value' => 'asstructure'],
      ],
    ],
    '#required' => TRUE,
  ];

  $element['mime_type'] = [
    '#type' => 'textfield',
    '#title' => $this->t('Mimetypes(s) to limit this Processor to.'),
    '#default_value' => $mime_type,
    '#description' => $this->t('A single Mimetype type or a coma separed list of mimetypes that qualify to be Processed. Leave empty to apply any file'),
  ];

  $element['path'] = [
    '#type' => 'textfield',
    '#title' => $this->t('The system path to the ghostscript (gs) binary that will be executed by this processor.'),
    '#default_value' => $setting('path'),
    '#description' => $this->t('A full system path to the gs binary present in the same environment your PHP runs, e.g <em>/usr/bin/gs</em>'),
    '#required' => TRUE,
  ];

  $element['arguments'] = [
    '#type' => 'textfield',
    '#title' => $this->t('Any additional argument your executable binary requires.'),
    // An empty stored value falls back to the bare file placeholder.
    '#default_value' => !empty($config['arguments']) ? $config['arguments'] : '%file',
    '#description' => $this->t('Any arguments your binary requires to run. Use %file as replacement for the file if the executable requires the filename to be passed under a specific argument.'),
    '#required' => TRUE,
  ];

  $element['path_tesseract'] = [
    '#type' => 'textfield',
    '#title' => $this->t('The system path to the Tesseract binary that will be executed by this processor.'),
    '#default_value' => $setting('path_tesseract'),
    '#description' => $this->t('A full system path to the Tesseract binary present in the same environment your PHP runs, e.g <em>/usr/bin/tesseract</em>'),
    '#required' => TRUE,
  ];

  $element['arguments_tesseract'] = [
    '#type' => 'textfield',
    '#title' => $this->t('Any additional argument for your tesseract binary.'),
    // An empty stored value falls back to the bare file placeholder.
    '#default_value' => !empty($config['arguments_tesseract']) ? $config['arguments_tesseract'] : '%file',
    '#description' => $this->t('Any arguments your binary requires to run. Use %file as replacement for the file that is output but the GS binary.'),
    '#required' => TRUE,
  ];

  $element['output_type'] = [
    '#type' => 'select',
    '#title' => $this->t('The expected and desired output of this processor.'),
    '#options' => [
      'entity:file' => 'One or more Files',
      'json' => 'Data/Values that can be serialized to JSON',
    ],
    '#default_value' => $setting('output_type'),
    '#description' => $this->t('If the output is just data and "One or more Files" is selected all data will be dumped into a file and handled as such.'),
  ];

  $element['output_destination'] = [
    '#type' => 'checkboxes',
    '#title' => $this->t("Where and how the output will be used."),
    '#options' => [
      'subkey' => 'In the same Source Metadata, as a child structure of each Processed file',
      'ownkey' => 'In the same Source Metadata but inside its own, top level, "as:flavour" subkey based on the given machine name of the current plugin',
      'plugin' => 'As Input for another processor Plugin',
    ],
    '#default_value' => $selection('output_destination'),
    '#description' => $this->t('As Input for another processor Plugin will only have an effect if another Processor is setup to consume this ouput.'),
    '#required' => TRUE,
  ];

  $element['timeout'] = [
    '#type' => 'number',
    '#title' => $this->t('Timeout in seconds for this process.'),
    '#default_value' => $setting('timeout'),
    '#description' => $this->t('If the process runs out of time it can still be processed again.'),
    '#size' => 2,
    '#maxlength' => 2,
    '#min' => 1,
  ];

  $element['weight'] = [
    '#type' => 'number',
    '#title' => $this->t('Order or execution in the global chain.'),
    '#default_value' => $setting('weight'),
  ];

  return $element;
}
176+
177+
178+
179+
/**
 * Reacts to the removal of dependencies by deferring to the parent class.
 *
 * @param array $dependencies
 *   The dependencies being removed, passed through unchanged.
 *
 * @return mixed
 *   Whatever the parent implementation returns.
 */
public function onDependencyRemoval(array $dependencies) {
  // Processors can be chained, so another processor instance may be using an
  // instance of this one; no extra handling for that exists here yet.
  // TODO: Change the autogenerated stub
  $parent_result = parent::onDependencyRemoval($dependencies);
  return $parent_result;
}
186+
187+
/**
 * Executes the logic of this plugin given a file path and a context.
 *
 * Builds the shell command for the requested page, executes it with the
 * configured timeout and stores the MiniOCR conversion of its output in
 * $io->output.
 *
 * @param \stdClass $io
 *   $io->input needs to contain
 *   - the property named by the "input_property" annotation key (the path of
 *     the file to process; needed since this executes locally via SHELL),
 *   - optionally the property named by the "input_argument" annotation key
 *     (the page number, defaults to 1),
 *   - metadata['dr:uuid'] (file UUID) and nuuid (node UUID), used to build
 *     the page id.
 *   $io->output will contain the result of the processor. It is left
 *   untouched when no command could be built.
 * @param string $context
 *   Not read by this implementation.
 *
 * @throws \InvalidArgumentException
 *   When the input property, the file UUID or the node UUID is missing.
 * @throws \Exception
 *   When the command could not be executed or timed out.
 */
public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPluginInterface::PROCESS) {
  // Specific input keys as defined in the plugin annotation.
  $input_property = $this->pluginDefinition['input_property'];
  $input_argument = $this->pluginDefinition['input_argument'];
  $file_uuid = isset($io->input->metadata['dr:uuid']) ? $io->input->metadata['dr:uuid'] : NULL;
  $node_uuid = isset($io->input->nuuid) ? $io->input->nuuid : NULL;
  $config = $this->getConfiguration();
  // In seconds.
  $timeout = $config['timeout'];

  if (!isset($io->input->{$input_property}) || !$file_uuid || !$node_uuid) {
    // A real throw statement: the previous "\throwException()" call is not a
    // PHP function and ended in a fatal "undefined function" error instead.
    throw new \InvalidArgumentException('OCR processing requires a file path, a file UUID and a node UUID in the input.');
  }

  // To be used by miniOCR as id in the form of
  // {nodeuuid}/canvas/{fileuuid}/p{pagenumber}
  $page_number = isset($io->input->{$input_argument}) ? (int) $io->input->{$input_argument} : 1;
  $pageid = $node_uuid . '/canvas/' . $file_uuid . '/p' . $page_number;

  // Remember the current character locale BEFORE switching to UTF-8 so it
  // can really be restored afterwards. PHP's shell escaping functions are
  // locale-aware, hence the switch while the command is built and executed.
  // Note: setting LANG through a separate shell_exec() call has no effect on
  // later commands (each call spawns its own shell), so that was dropped.
  $previous_locale = setlocale(LC_CTYPE, '0');
  setlocale(LC_CTYPE, 'en_US.UTF-8');
  try {
    $execstring = $this->buildExecutableCommand($io);
    if (!$execstring) {
      // Nothing to execute (e.g. binaries or arguments are not usable).
      return;
    }
    $output = $this->proc_execute($execstring, $timeout);
    if (is_null($output)) {
      throw new \Exception("Could not execute {$execstring} or timed out");
    }
  }
  finally {
    if ($previous_locale !== FALSE) {
      setlocale(LC_CTYPE, $previous_locale);
    }
  }

  $io->output = $this->hOCRtoMiniOCR($output, $pageid);
}
236+
237+
/**
 * Builds the two step (Ghostscript, then Tesseract) command for one page.
 *
 * Step one renders the requested page of the source file to a 300dpi gray
 * PNG stored next to the source file; step two runs Tesseract on that PNG.
 * Both steps are joined with "&&" so the second only runs if the first
 * succeeds.
 *
 * @param \stdClass $io
 *   $io->input needs to contain the property named by the "input_property"
 *   annotation key (path of the source file) and may contain the property
 *   named by the "input_argument" annotation key (page number, default 1).
 *
 * @return null|string
 *   NULL when no file path was passed, an empty string when the binaries
 *   cannot be verified or an arguments template lacks the %file placeholder,
 *   the full command otherwise.
 */
public function buildExecutableCommand(\stdClass $io) {
  $input_property = $this->pluginDefinition['input_property'];
  $input_argument = $this->pluginDefinition['input_argument'];
  $file_path = isset($io->input->{$input_property}) ? $io->input->{$input_property} : NULL;
  if (empty($file_path)) {
    return NULL;
  }
  // Sets the default page to 1 if not passed.
  $page_number = isset($io->input->{$input_argument}) ? (int) $io->input->{$input_argument} : 1;

  $config = $this->getConfiguration();
  $execpath_gs = $config['path'];
  $arguments_gs = $config['arguments'];
  $execpath_tesseract = $config['path_tesseract'];
  $arguments_tesseract = $config['arguments_tesseract'];

  $utility = \Drupal::service('strawberryfield.utility');
  $can_run_gs = $utility->verifyCommand($execpath_gs);
  $can_run_tesseract = $utility->verifyCommand($execpath_tesseract);

  if (!$can_run_gs ||
    !$can_run_tesseract ||
    strpos($arguments_gs, '%file') === FALSE ||
    strpos($arguments_tesseract, '%file') === FALSE) {
    error_log("missing arguments for OCR");
    return '';
  }

  // The intermediate PNG lives next to the source file:
  // {folder}/{filename}_{page}.png
  $filename = pathinfo($file_path, PATHINFO_FILENAME);
  $sourcefolder = pathinfo($file_path, PATHINFO_DIRNAME);
  $sourcefolder = strlen($sourcefolder) > 0 ? $sourcefolder . '/' : sys_get_temp_dir() . '/';
  $gs_destination_filename = "{$sourcefolder}{$filename}_{$page_number}.png";

  // r300 == 300dpi; should eventually be configurable.
  // File paths are quoted individually with escapeshellarg() so names with
  // spaces or shell metacharacters stay a single, literal argument. No
  // sprintf() is involved any more, so "%" characters in a file name cannot
  // be misread as format specifiers.
  $command_gs = escapeshellcmd($execpath_gs)
    . ' -dBATCH -dNOPAUSE -r300 -dUseCropBox -dQUIET -sDEVICE=pnggray'
    . " -dFirstPage={$page_number} -dLastPage={$page_number}"
    . ' -sOutputFile=' . escapeshellarg($gs_destination_filename)
    . ' ' . $this->injectQuotedFilePath($arguments_gs, $file_path);

  $command_tesseract = escapeshellcmd($execpath_tesseract)
    . ' ' . $this->injectQuotedFilePath($arguments_tesseract, $gs_destination_filename);

  return $command_gs . ' && ' . $command_tesseract;
}

/**
 * Replaces the first %file placeholder of a template with a quoted path.
 *
 * The administrator supplied parts of the template are passed through
 * escapeshellcmd(); the path itself is quoted with escapeshellarg(). The
 * placeholder is expected to stand alone as an argument: a matching pair of
 * quotes typed directly around it is dropped, since the path gets quoted
 * here anyway.
 *
 * @param string $template
 *   Arguments template, e.g. "%file stdout hocr".
 * @param string $path
 *   File path to inject.
 *
 * @return string
 *   The escaped arguments string.
 */
private function injectQuotedFilePath($template, $path) {
  // Kept from the previous implementation: literal "%s" tokens are removed
  // from the template.
  $template = str_replace('%s', '', (string) $template);
  $placeholder = '%file';
  $position = strpos($template, $placeholder);
  if ($position === FALSE) {
    return escapeshellcmd($template);
  }
  $before = (string) substr($template, 0, $position);
  $after = (string) substr($template, $position + strlen($placeholder));

  $quote = (string) substr($before, -1);
  if (($quote === '"' || $quote === "'") && strpos($after, $quote) === 0) {
    $before = (string) substr($before, 0, -1);
    $after = (string) substr($after, 1);
  }

  return escapeshellcmd($before) . escapeshellarg($path) . escapeshellcmd($after);
}
301+
302+
/**
 * Converts hOCR markup into a MiniOCR XML document.
 *
 * Pages, lines and words are located through their hOCR class names at any
 * nesting depth, so both flat documents (page > line > word) and documents
 * that nest areas/paragraphs in between are handled. Coordinates are read
 * from the "bbox x0 y0 x1 y1" property of each title attribute, wherever in
 * the title it appears.
 *
 * @param string $output
 *   The hOCR (XML) markup to convert.
 * @param string $pageid
 *   Identifier written as the "id" attribute of every MiniOCR page element.
 *
 * @return string|null
 *   The MiniOCR XML, or NULL when $output cannot be parsed as XML.
 */
protected function hOCRtoMiniOCR($output, $pageid) {
  // Collect libxml errors internally. This has to be switched on BEFORE
  // parsing, otherwise invalid markup still raises PHP warnings.
  $previous_error_mode = libxml_use_internal_errors(TRUE);
  $hocr = simplexml_load_string($output);
  libxml_clear_errors();
  libxml_use_internal_errors($previous_error_mode);
  if (!$hocr) {
    error_log('Could not convert HOCR to MiniOCR, sources is not valid XML');
    return NULL;
  }

  // XPath predicate matching elements that carry any of the given classes.
  $has_class = function (array $classes) {
    $tests = [];
    foreach ($classes as $class) {
      $tests[] = "contains(concat(' ', normalize-space(@class), ' '), ' {$class} ')";
    }
    return '[' . implode(' or ', $tests) . ']';
  };
  // Extracts [x0, y0, x1, y1] from a title attribute, or NULL if it has no
  // bbox. Works for "bbox 0 0 10 10", "image "x.png"; bbox 0 0 10 10; ..."
  // and drops the ";" that may directly follow the last number.
  $bbox = function ($title) {
    if (preg_match('/\bbbox\s+(-?\d+)\s+(-?\d+)\s+(-?\d+)\s+(-?\d+)/', (string) $title, $matches)) {
      return array_slice($matches, 1);
    }
    return NULL;
  };

  $page_selector = '//*' . $has_class(['ocr_page']);
  // hOCR line-level classes; presumably the full set the OCR engine emits for
  // text lines — verify against real output.
  $line_selector = './/*' . $has_class(['ocr_line', 'ocr_header', 'ocr_caption', 'ocr_textfloat']);
  $word_selector = './/*' . $has_class(['ocrx_word']);

  $w = new \XMLWriter();
  $w->openMemory();
  $w->startDocument('1.0', 'UTF-8');
  $w->startElement("ocr");
  foreach ($hocr->xpath($page_selector) ?: [] as $page) {
    $page_box = $bbox($page['title']);
    if ($page_box === NULL) {
      // Without a bounding box the page dimensions are unknown.
      continue;
    }
    $w->startElement("p");
    $w->writeAttribute("id", $pageid);
    // Third and fourth bbox values, i.e. the far corner of the page.
    $w->writeAttribute("wh", $page_box[2] . " " . $page_box[3]);
    $w->startElement("b");
    foreach ($page->xpath($line_selector) ?: [] as $line) {
      $w->startElement("l");
      foreach ($line->xpath($word_selector) ?: [] as $word) {
        $word_box = $bbox($word['title']);
        if ($word_box === NULL) {
          continue;
        }
        $w->startElement("w");
        // NOTE(review): the raw bbox corners (x0 y0 x1 y1) are written as
        // before; MiniOCR consumers may expect "x y width height" — confirm.
        $w->writeAttribute("x", implode(' ', $word_box));
        // Only the direct text of the word element is used; text wrapped in
        // nested markup inside a word is not included.
        $w->text((string) $word);
        $w->endElement();
      }
      $w->endElement();
    }
    // Close <b> and <p>.
    $w->endElement();
    $w->endElement();
  }
  $w->endElement();
  $w->endDocument();
  return $w->outputMemory(TRUE);
}
346+
347+
348+
349+
350+
}

0 commit comments

Comments
 (0)