From 5c6fa9f80a689d34a2e7d6b26f2187cca765380b Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Mon, 23 Nov 2020 00:19:43 -0500 Subject: [PATCH] First pass on OcrPostProcessor miniOCR code I tried to adapt is not working yet. This needs another processor in the chain that passes (invokes this as child) once per page. The idea is that each processor is Atomic. So this one only deals with a single page. Too tired to keep coding. Let's see if I figure out what is missing from the XML transformation. Probably something stupid on my side. I'm also now having trouble reading the value back. It's a string, but PHP on decode thinks it's an object.. gosh --- .../IndexPostProcessorQueueWorker.php | 3 +- .../OcrPostProcessor.php | 350 ++++++++++++++++++ 2 files changed, 352 insertions(+), 1 deletion(-) create mode 100644 src/Plugin/StrawberryRunnersPostProcessor/OcrPostProcessor.php diff --git a/src/Plugin/QueueWorker/IndexPostProcessorQueueWorker.php b/src/Plugin/QueueWorker/IndexPostProcessorQueueWorker.php index ba18063..6d1c715 100644 --- a/src/Plugin/QueueWorker/IndexPostProcessorQueueWorker.php +++ b/src/Plugin/QueueWorker/IndexPostProcessorQueueWorker.php @@ -193,7 +193,7 @@ public function processItem($data) { error_log(empty($processed_data)); //@TODO allow a force in case of corrupted key value? Partial output // Extragenous weird data? - if (empty($processed_data) || + if (true || empty($processed_data) || $data->force == TRUE || (!isset($processed_data->checksum) || empty($processed_data->checksum) || @@ -219,6 +219,7 @@ public function processItem($data) { $toindex = new \stdClass(); $toindex->fulltext = $io->output; $toindex->checksum = $data->metadata['checksum']; + error_log(var_export($toindex,true)); $this->keyValue->get($keyvalue_collection)->set($key, $toindex); // Get which indexes have our StrawberryfieldFlavorDatasource enabled! 
diff --git a/src/Plugin/StrawberryRunnersPostProcessor/OcrPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/OcrPostProcessor.php
new file mode 100644
index 0000000..1430be0
--- /dev/null
+++ b/src/Plugin/StrawberryRunnersPostProcessor/OcrPostProcessor.php
@@ -0,0 +1,350 @@
+      // NOTE(review): everything before this point (opening PHP tag, namespace,
+      // class declaration and, presumably, the start of defaultConfiguration())
+      // is missing from this copy of the patch; restore it from the repository.
+      'asstructure',
+      'mime_type' => ['application/pdf'],
+      'path' => '',
+      'path_tesseract' => '',
+      'arguments' => '',
+      'arguments_tesseract' => '',
+      'output_type' => 'json',
+      'output_destination' => 'subkey',
+    ] + parent::defaultConfiguration();
+  }
+
+  /**
+   * Reports the dependencies of this processor.
+   *
+   * @return array
+   *   Always an empty array for now. Returning an array (instead of nothing)
+   *   keeps callers that merge dependency lists working.
+   */
+  public function calculateDependencies() {
+    // Since Processors could be chained we need to check if any other
+    // processor instance is using an instance of this one
+    // @TODO: Implement calculateDependencies() method.
+    return [];
+  }
+
+  /**
+   * Builds the configuration form elements for this processor.
+   *
+   * @param array $parents
+   *   Form parents of the element. Not used by this implementation.
+   * @param \Drupal\Core\Form\FormStateInterface $form_state
+   *   The current form state. Not used by this implementation.
+   *
+   * @return array
+   *   Form API elements keyed by configuration key.
+   */
+  public function settingsForm(array $parents, FormStateInterface $form_state) {
+    $config = $this->getConfiguration();
+    $element = [];
+
+    // The default for 'mime_type' is an array while a textfield needs a
+    // string; flatten it so the default shows up as a comma separated list.
+    $mime_type = $config['mime_type'];
+    if (is_array($mime_type)) {
+      $mime_type = implode(',', $mime_type);
+    }
+
+    $element['source_type'] = [
+      '#type' => 'select',
+      '#title' => $this->t('The type of source data this processor works on'),
+      '#options' => [
+        'asstructure' => 'File entities referenced in the as:filetype JSON structure',
+        'filepath' => 'Full file paths passed by another processor',
+      ],
+      '#default_value' => $config['source_type'],
+      '#description' => $this->t('Select from where the source file this processor needs is fetched'),
+      '#required' => TRUE,
+    ];
+
+    $element['ado_type'] = [
+      '#type' => 'textfield',
+      '#title' => $this->t('ADO type(s) to limit this processor to.'),
+      '#default_value' => $config['ado_type'],
+      '#description' => $this->t('A single ADO type or a coma delimited list of ado types that qualify to be Processed. Leave empty to apply to all ADOs.'),
+    ];
+
+    $element['jsonkey'] = [
+      '#type' => 'checkboxes',
+      '#title' => $this->t('The JSON key that contains the desired source files.'),
+      '#options' => [
+        'as:image' => 'as:image',
+        'as:document' => 'as:document',
+        'as:audio' => 'as:audio',
+        'as:video' => 'as:video',
+        'as:text' => 'as:text',
+        'as:application' => 'as:application',
+      ],
+      '#default_value' => (!empty($config['jsonkey']) && is_array($config['jsonkey'])) ? $config['jsonkey'] : [],
+      '#states' => [
+        'visible' => [
+          ':input[name="pluginconfig[source_type]"]' => ['value' => 'asstructure'],
+        ],
+      ],
+      '#required' => TRUE,
+    ];
+
+    $element['mime_type'] = [
+      '#type' => 'textfield',
+      '#title' => $this->t('Mimetypes(s) to limit this Processor to.'),
+      '#default_value' => $mime_type,
+      '#description' => $this->t('A single Mimetype type or a coma separed list of mimetypes that qualify to be Processed. Leave empty to apply any file'),
+    ];
+
+    $element['path'] = [
+      '#type' => 'textfield',
+      '#title' => $this->t('The system path to the ghostscript (gs) binary that will be executed by this processor.'),
+      '#default_value' => $config['path'],
+      '#description' => $this->t('A full system path to the gs binary present in the same environment your PHP runs, e.g /usr/bin/gs'),
+      '#required' => TRUE,
+    ];
+
+    $element['arguments'] = [
+      '#type' => 'textfield',
+      '#title' => $this->t('Any additional argument your executable binary requires.'),
+      '#default_value' => !empty($config['arguments']) ? $config['arguments'] : '%file',
+      '#description' => $this->t('Any arguments your binary requires to run. Use %file as replacement for the file if the executable requires the filename to be passed under a specific argument.'),
+      '#required' => TRUE,
+    ];
+
+    $element['path_tesseract'] = [
+      '#type' => 'textfield',
+      '#title' => $this->t('The system path to the Tesseract binary that will be executed by this processor.'),
+      '#default_value' => $config['path_tesseract'],
+      '#description' => $this->t('A full system path to the Tesseract binary present in the same environment your PHP runs, e.g /usr/bin/tesseract'),
+      '#required' => TRUE,
+    ];
+
+    $element['arguments_tesseract'] = [
+      '#type' => 'textfield',
+      '#title' => $this->t('Any additional argument for your tesseract binary.'),
+      '#default_value' => !empty($config['arguments_tesseract']) ? $config['arguments_tesseract'] : '%file',
+      '#description' => $this->t('Any arguments your binary requires to run. Use %file as replacement for the file that is output but the GS binary.'),
+      '#required' => TRUE,
+    ];
+
+    $element['output_type'] = [
+      '#type' => 'select',
+      '#title' => $this->t('The expected and desired output of this processor.'),
+      '#options' => [
+        'entity:file' => 'One or more Files',
+        'json' => 'Data/Values that can be serialized to JSON',
+      ],
+      '#default_value' => $config['output_type'],
+      '#description' => $this->t('If the output is just data and "One or more Files" is selected all data will be dumped into a file and handled as such.'),
+    ];
+
+    $element['output_destination'] = [
+      '#type' => 'checkboxes',
+      '#title' => $this->t("Where and how the output will be used."),
+      '#options' => [
+        'subkey' => 'In the same Source Metadata, as a child structure of each Processed file',
+        'ownkey' => 'In the same Source Metadata but inside its own, top level, "as:flavour" subkey based on the given machine name of the current plugin',
+        'plugin' => 'As Input for another processor Plugin',
+      ],
+      '#default_value' => (!empty($config['output_destination']) && is_array($config['output_destination'])) ? $config['output_destination'] : [],
+      '#description' => $this->t('As Input for another processor Plugin will only have an effect if another Processor is setup to consume this ouput.'),
+      '#required' => TRUE,
+    ];
+
+    $element['timeout'] = [
+      '#type' => 'number',
+      '#title' => $this->t('Timeout in seconds for this process.'),
+      '#default_value' => $config['timeout'],
+      '#description' => $this->t('If the process runs out of time it can still be processed again.'),
+      '#size' => 2,
+      '#maxlength' => 2,
+      '#min' => 1,
+    ];
+
+    $element['weight'] = [
+      '#type' => 'number',
+      '#title' => $this->t('Order or execution in the global chain.'),
+      '#default_value' => $config['weight'],
+    ];
+
+    return $element;
+  }
+
+  /**
+   * Reacts to the removal of dependencies by delegating to the parent class.
+   *
+   * @param array $dependencies
+   *   The dependencies being removed, passed through untouched.
+   *
+   * @return mixed
+   *   Whatever the parent implementation returns.
+   */
+  public function onDependencyRemoval(array $dependencies) {
+    // Since Processors could be chained we need to check if any other
+    // processor instance is using an instance of this one
+    // TODO: Change the autogenerated stub
+    return parent::onDependencyRemoval($dependencies);
+  }
+
+  /**
+   * Executes the logic of this plugin given a file path and a context.
+   *
+   * Builds the shell command for one page, runs it and stores the resulting
+   * MiniOCR XML (or NULL when the output could not be converted) in
+   * $io->output. When no command can be built $io->output is left untouched.
+   *
+   * @param \stdClass $io
+   *   $io->input needs to contain
+   *   \Drupal\strawberry_runners\Annotation\StrawberryRunnersPostProcessor::$input_property
+   *   \Drupal\strawberry_runners\Annotation\StrawberryRunnersPostProcessor::$input_arguments
+   *   plus ->metadata['dr:uuid'] and ->nuuid, which are used to build the
+   *   page id. $io->output will contain the result of the processor.
+   * @param string $context
+   *   Not used by this implementation.
+   *
+   * @throws \InvalidArgumentException
+   *   When the input property, the file UUID or the node UUID is missing.
+   * @throws \Exception
+   *   When the command could not be executed or timed out.
+   */
+  public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPluginInterface::PROCESS) {
+    // Specific input key as defined in the annotation
+    // In this case it will contain an absolute Path to a File.
+    // Needed since this executes locally on the server via SHELL.
+    $input_property = $this->pluginDefinition['input_property'];
+    $input_argument = $this->pluginDefinition['input_argument'];
+    $file_uuid = isset($io->input->metadata['dr:uuid']) ? $io->input->metadata['dr:uuid'] : NULL;
+    $node_uuid = isset($io->input->nuuid) ? $io->input->nuuid : NULL;
+    $config = $this->getConfiguration();
+    // In seconds.
+    $timeout = $config['timeout'];
+    error_log('run OCR');
+
+    if (!isset($io->input->{$input_property}) || !$file_uuid || !$node_uuid) {
+      throw new \InvalidArgumentException('OCR processing requires the input property, a file UUID and a node UUID.');
+    }
+
+    // To be used by miniOCR as id in the form of
+    // {nodeuuid}/canvas/{fileuuid}/p{pagenumber}
+    $page_number = isset($io->input->{$input_argument}) ? (int) $io->input->{$input_argument} : 1;
+    $pageid = $node_uuid . '/canvas/' . $file_uuid . '/p' . $page_number;
+
+    // Shell escaping of multi-byte paths depends on LC_CTYPE. Remember the
+    // current value *before* switching so it can really be restored.
+    $previous_locale = setlocale(LC_CTYPE, '0');
+    setlocale(LC_CTYPE, 'en_US.UTF-8');
+    try {
+      $execstring = $this->buildExecutableCommand($io);
+      error_log((string) $execstring);
+      if ($execstring) {
+        $output = $this->proc_execute($execstring, $timeout);
+        if (is_null($output)) {
+          throw new \Exception("Could not execute {$execstring} or timed out");
+        }
+
+        $miniocr = $this->hOCRtoMiniOCR($output, $pageid);
+        error_log((string) $miniocr);
+        $io->output = $miniocr;
+      }
+    }
+    finally {
+      if ($previous_locale !== FALSE) {
+        setlocale(LC_CTYPE, $previous_locale);
+      }
+    }
+  }
+
+  /**
+   * Builds a clean Command string using a File path.
+   *
+   * The command has two steps joined by "&&": ghostscript renders the
+   * requested page to a grayscale PNG next to the source file, then tesseract
+   * is called on that PNG.
+   *
+   * @param \stdClass $io
+   *   $io->input needs to contain
+   *   \Drupal\strawberry_runners\Annotation\StrawberryRunnersPostProcessor::$input_property
+   *   \Drupal\strawberry_runners\Annotation\StrawberryRunnersPostProcessor::$input_arguments
+   *   The page number defaults to 1 when the input argument is not set.
+   *
+   * @return null|string
+   *   NULL when no file path was passed, an empty string when the command
+   *   could not be built, the full command otherwise.
+   */
+  public function buildExecutableCommand(\stdClass $io) {
+    $input_property = $this->pluginDefinition['input_property'];
+    $input_argument = $this->pluginDefinition['input_argument'];
+    $file_path = isset($io->input->{$input_property}) ? $io->input->{$input_property} : NULL;
+    // Sets the default page to 1 if not passed.
+    $page_number = isset($io->input->{$input_argument}) ? (int) $io->input->{$input_argument} : 1;
+    $config = $this->getConfiguration();
+    $execpath_gs = $config['path'];
+    $arguments_gs = $config['arguments'];
+    $execpath_tesseract = $config['path_tesseract'];
+    $arguments_tesseract = $config['arguments_tesseract'];
+
+    if (empty($file_path)) {
+      return NULL;
+    }
+
+    $utility = \Drupal::service('strawberryfield.utility');
+    $can_run_gs = $utility->verifyCommand($execpath_gs);
+    $can_run_tesseract = $utility->verifyCommand($execpath_tesseract);
+
+    if (!$can_run_gs ||
+      !$can_run_tesseract ||
+      strpos($arguments_gs, '%file') === FALSE ||
+      strpos($arguments_tesseract, '%file') === FALSE) {
+      error_log("missing arguments for OCR");
+      return '';
+    }
+
+    $filename = pathinfo($file_path, PATHINFO_FILENAME);
+    $sourcefolder = pathinfo($file_path, PATHINFO_DIRNAME);
+    $sourcefolder = strlen($sourcefolder) > 0 ? $sourcefolder . '/' : sys_get_temp_dir() . '/';
+    $gs_destination_filename = "{$sourcefolder}{$filename}_{$page_number}.png";
+
+    // With r300 == 300dpi, should be configurable, etc. All should be
+    // configurable. Paths are escaped one by one as single shell arguments so
+    // spaces and shell metacharacters in file names cannot split or alter the
+    // command; only the configured parts go through escapeshellcmd().
+    $command_gs = escapeshellcmd($execpath_gs)
+      . " -dBATCH -dNOPAUSE -r300 -dUseCropBox -dQUIET -sDEVICE=pnggray -dFirstPage={$page_number} -dLastPage={$page_number}"
+      . ' -sOutputFile=' . escapeshellarg($gs_destination_filename)
+      . ' ' . $this->injectFilePath($arguments_gs, $file_path);
+    $command_tesseract = escapeshellcmd($execpath_tesseract)
+      . ' ' . $this->injectFilePath($arguments_tesseract, $gs_destination_filename);
+
+    $command = $command_gs . ' && ' . $command_tesseract;
+
+    // Only return $command if it contains the original filepath somewhere.
+    // This also catches paths that escaping had to alter.
+    return strpos($command, $file_path) !== FALSE ? $command : '';
+  }
+
+  /**
+   * Replaces the first %file token of an argument string with an escaped path.
+   *
+   * @param string $arguments
+   *   The configured argument string.
+   * @param string $path
+   *   The path to insert, escaped as a single shell argument.
+   *
+   * @return string
+   *   The shell-escaped argument string.
+   */
+  private function injectFilePath($arguments, $path) {
+    // Literal "%s" was dropped by the previous sprintf() based substitution;
+    // keep doing so for configurations written against it. No sprintf() is
+    // involved any more, so other "%" sequences pass through untouched.
+    $arguments = str_replace('%s', '', $arguments);
+    $position = strpos($arguments, '%file');
+    if ($position === FALSE) {
+      return escapeshellcmd($arguments);
+    }
+    $before = substr($arguments, 0, $position);
+    $after = (string) substr($arguments, $position + strlen('%file'));
+    return escapeshellcmd($before) . escapeshellarg($path) . escapeshellcmd($after);
+  }
+
+  /**
+   * Converts hOCR markup into a MiniOCR XML document.
+   *
+   * @param string $output
+   *   The hOCR markup to convert.
+   * @param string $pageid
+   *   Written as the "id" attribute of every page element.
+   *
+   * @return null|string
+   *   The MiniOCR XML, or NULL when $output is not valid XML.
+   */
+  protected function hOCRtoMiniOCR($output, $pageid) {
+    error_log((string) $output);
+    // Internal error handling has to be enabled *before* parsing, otherwise
+    // malformed markup still raises PHP warnings.
+    $internalErrors = libxml_use_internal_errors(TRUE);
+    $hocr = simplexml_load_string((string) $output);
+    libxml_clear_errors();
+    libxml_use_internal_errors($internalErrors);
+    if ($hocr === FALSE) {
+      error_log('Could not convert HOCR to MiniOCR, sources is not valid XML');
+      return NULL;
+    }
+
+    $w = new \XMLWriter();
+    $w->openMemory();
+    $w->startDocument('1.0', 'UTF-8');
+    $w->startElement("ocr");
+    // NOTE(review): direct children of <body> are treated as pages, their
+    // children as lines and the children of those as words. hOCR written by
+    // Tesseract presumably nests area/paragraph elements between page and
+    // line; confirm against real output.
+    $pages = isset($hocr->body) ? $hocr->body->children() : [];
+    foreach ($pages as $page) {
+      $page_box = $this->parseHocrBoundingBox((string) $page['title']);
+      if ($page_box === NULL) {
+        // Nothing usable to size the page with.
+        continue;
+      }
+      $w->startElement("p");
+      $w->writeAttribute("id", $pageid);
+      // Third and fourth bbox values, as before.
+      $w->writeAttribute("wh", $page_box[2] . " " . $page_box[3]);
+      $w->startElement("b");
+      foreach ($page->children() as $line) {
+        $w->startElement("l");
+        foreach ($line->children() as $word) {
+          $word_box = $this->parseHocrBoundingBox((string) $word['title']);
+          if ($word_box === NULL) {
+            continue;
+          }
+          $w->startElement("w");
+          $w->writeAttribute("x", implode(' ', $word_box));
+          error_log($word->__toString());
+          $w->text($word->__toString());
+          $w->endElement();
+        }
+        $w->endElement();
+      }
+      $w->endElement();
+      $w->endElement();
+    }
+    $w->endElement();
+    $w->endDocument();
+    return $w->outputMemory(TRUE);
+  }
+
+  /**
+   * Extracts the four "bbox" numbers from an hOCR title attribute.
+   *
+   * Works wherever the bbox property sits inside the title, e.g.
+   * 'bbox 0 0 10 20' or 'image "a.png"; bbox 0 0 10 20; ppageno 0'.
+   *
+   * @param string $title
+   *   The value of a title attribute.
+   *
+   * @return null|string[]
+   *   The four values in the order they appear, or NULL when there is no
+   *   complete bbox property.
+   */
+  private function parseHocrBoundingBox($title) {
+    if (!preg_match('/\bbbox\s+(-?\d+)\s+(-?\d+)\s+(-?\d+)\s+(-?\d+)/', $title, $matches)) {
+      return NULL;
+    }
+    return [$matches[1], $matches[2], $matches[3], $matches[4]];
+  }
+
+}