|
| 1 | +<?php |
| 2 | +/** |
| 3 | + * Created by PhpStorm. |
| 4 | + * User: dpino |
| 5 | + * Date: 11/11/19 |
| 6 | + * Time: 8:18 PM |
| 7 | + */ |
| 8 | + |
| 9 | +namespace Drupal\strawberry_runners\Plugin\StrawberryRunnersPostProcessor; |
| 10 | + |
| 11 | +use Drupal\Core\Form\FormStateInterface; |
| 12 | +use Drupal\strawberry_runners\Plugin\StrawberryRunnersPostProcessor\SystemBinaryPostProcessor; |
| 13 | +use Drupal\strawberry_runners\Annotation\StrawberryRunnersPostProcessor; |
| 14 | +use Drupal\strawberry_runners\Plugin\StrawberryRunnersPostProcessorPluginBase; |
| 15 | +use Drupal\strawberry_runners\Plugin\StrawberryRunnersPostProcessorPluginInterface; |
| 16 | + |
| 17 | + |
| 18 | +/** |
| 19 | + * |
| 20 | + * OCR Post processor Plugin Implementation (Ghostscript page rendering + Tesseract). |
| 21 | + * |
| 22 | + * @StrawberryRunnersPostProcessor( |
| 23 | + * id = "ocr", |
| 24 | + * label = @Translation("Post processor that Runs OCR/HORC against files"), |
| 25 | + * input_type = "entity:file", |
| 26 | + * input_property = "filepath", |
| 27 | + * input_argument = "page_number" |
| 28 | + * ) |
| 29 | + */ |
| 30 | +class OcrPostProcessor extends SystemBinaryPostProcessor { |
| 31 | + |
| 32 | + /** |
| 33 | + * {@inheritdoc} |
| 34 | + */ |
| 35 | + public function defaultConfiguration() { |
| 36 | + return [ |
| 37 | + 'source_type' => 'asstructure', |
| 38 | + 'mime_type' => ['application/pdf'], |
| 39 | + 'path' => '', |
| 40 | + 'path_tesseract' => '', |
| 41 | + 'arguments' => '', |
| 42 | + 'arguments_tesseract' => '', |
| 43 | + 'output_type' => 'json', |
| 44 | + 'output_destination' => 'subkey', |
| 45 | + ] + parent::defaultConfiguration(); |
| 46 | + } |
| 47 | + |
| 48 | + |
| 49 | + public function calculateDependencies() { |
| 50 | + // Since Processors could be chained we need to check if any other |
| 51 | + // processor instance is using an instance of this one |
| 52 | + // @TODO: Implement calculateDependencies() method. |
| 53 | + } |
| 54 | + |
| 55 | + public function settingsForm(array $parents, FormStateInterface $form_state) { |
| 56 | + |
| 57 | + $element['source_type'] = [ |
| 58 | + '#type' => 'select', |
| 59 | + '#title' => $this->t('The type of source data this processor works on'), |
| 60 | + '#options' => [ |
| 61 | + 'asstructure' => 'File entities referenced in the as:filetype JSON structure', |
| 62 | + 'filepath' => 'Full file paths passed by another processor', |
| 63 | + ], |
| 64 | + '#default_value' => $this->getConfiguration()['source_type'], |
| 65 | + '#description' => $this->t('Select from where the source file this processor needs is fetched'), |
| 66 | + '#required' => TRUE |
| 67 | + ]; |
| 68 | + |
| 69 | + $element['ado_type'] = [ |
| 70 | + '#type' => 'textfield', |
| 71 | + '#title' => $this->t('ADO type(s) to limit this processor to.'), |
| 72 | + '#default_value' => $this->getConfiguration()['ado_type'], |
| 73 | + '#description' => $this->t('A single ADO type or a coma delimited list of ado types that qualify to be Processed. Leave empty to apply to all ADOs.'), |
| 74 | + ]; |
| 75 | + |
| 76 | + $element['jsonkey'] = [ |
| 77 | + '#type' => 'checkboxes', |
| 78 | + '#title' => $this->t('The JSON key that contains the desired source files.'), |
| 79 | + '#options' => [ |
| 80 | + 'as:image' => 'as:image', |
| 81 | + 'as:document' => 'as:document', |
| 82 | + 'as:audio' => 'as:audio', |
| 83 | + 'as:video' => 'as:video', |
| 84 | + 'as:text' => 'as:text', |
| 85 | + 'as:application' => 'as:application', |
| 86 | + ], |
| 87 | + '#default_value' => (!empty($this->getConfiguration()['jsonkey']) && is_array($this->getConfiguration()['jsonkey'])) ? $this->getConfiguration()['jsonkey'] : [], |
| 88 | + '#states' => [ |
| 89 | + 'visible' => [ |
| 90 | + ':input[name="pluginconfig[source_type]"]' => ['value' => 'asstructure'], |
| 91 | + ], |
| 92 | + ], |
| 93 | + '#required' => TRUE, |
| 94 | + ]; |
| 95 | + |
| 96 | + $element['mime_type'] = [ |
| 97 | + '#type' => 'textfield', |
| 98 | + '#title' => $this->t('Mimetypes(s) to limit this Processor to.'), |
| 99 | + '#default_value' => $this->getConfiguration()['mime_type'], |
| 100 | + '#description' => $this->t('A single Mimetype type or a coma separed list of mimetypes that qualify to be Processed. Leave empty to apply any file'), |
| 101 | + ]; |
| 102 | + $element['path'] = [ |
| 103 | + '#type' => 'textfield', |
| 104 | + '#title' => $this->t('The system path to the ghostscript (gs) binary that will be executed by this processor.'), |
| 105 | + '#default_value' => $this->getConfiguration()['path'], |
| 106 | + '#description' => t('A full system path to the gs binary present in the same environment your PHP runs, e.g <em>/usr/bin/gs</em>'), |
| 107 | + '#required' => TRUE, |
| 108 | + ]; |
| 109 | + |
| 110 | + $element['arguments'] = [ |
| 111 | + '#type' => 'textfield', |
| 112 | + '#title' => $this->t('Any additional argument your executable binary requires.'), |
| 113 | + '#default_value' => !empty($this->getConfiguration()['arguments']) ? $this->getConfiguration()['arguments'] : '%file', |
| 114 | + '#description' => t('Any arguments your binary requires to run. Use %file as replacement for the file if the executable requires the filename to be passed under a specific argument.'), |
| 115 | + '#required' => TRUE, |
| 116 | + ]; |
| 117 | + |
| 118 | + |
| 119 | + $element['path_tesseract'] = [ |
| 120 | + '#type' => 'textfield', |
| 121 | + '#title' => $this->t('The system path to the Tesseract binary that will be executed by this processor.'), |
| 122 | + '#default_value' => $this->getConfiguration()['path_tesseract'], |
| 123 | + '#description' => t('A full system path to the Tesseract binary present in the same environment your PHP runs, e.g <em>/usr/bin/tesseract</em>'), |
| 124 | + '#required' => TRUE, |
| 125 | + ]; |
| 126 | + |
| 127 | + $element['arguments_tesseract'] = [ |
| 128 | + '#type' => 'textfield', |
| 129 | + '#title' => $this->t('Any additional argument for your tesseract binary.'), |
| 130 | + '#default_value' => !empty($this->getConfiguration()['arguments_tesseract']) ? $this->getConfiguration()['arguments_tesseract'] : '%file', |
| 131 | + '#description' => t('Any arguments your binary requires to run. Use %file as replacement for the file that is output but the GS binary.'), |
| 132 | + '#required' => TRUE, |
| 133 | + ]; |
| 134 | + |
| 135 | + $element['output_type'] = [ |
| 136 | + '#type' => 'select', |
| 137 | + '#title' => $this->t('The expected and desired output of this processor.'), |
| 138 | + '#options' => [ |
| 139 | + 'entity:file' => 'One or more Files', |
| 140 | + 'json' => 'Data/Values that can be serialized to JSON', |
| 141 | + ], |
| 142 | + '#default_value' => $this->getConfiguration()['output_type'], |
| 143 | + '#description' => $this->t('If the output is just data and "One or more Files" is selected all data will be dumped into a file and handled as such.'), |
| 144 | + ]; |
| 145 | + |
| 146 | + $element['output_destination'] = [ |
| 147 | + '#type' => 'checkboxes', |
| 148 | + '#title' => $this->t("Where and how the output will be used."), |
| 149 | + '#options' => [ |
| 150 | + 'subkey' => 'In the same Source Metadata, as a child structure of each Processed file', |
| 151 | + 'ownkey' => 'In the same Source Metadata but inside its own, top level, "as:flavour" subkey based on the given machine name of the current plugin', |
| 152 | + 'plugin' => 'As Input for another processor Plugin', |
| 153 | + ], |
| 154 | + '#default_value' => (!empty($this->getConfiguration()['output_destination']) && is_array($this->getConfiguration()['output_destination']))? $this->getConfiguration()['output_destination']: [], |
| 155 | + '#description' => t('As Input for another processor Plugin will only have an effect if another Processor is setup to consume this ouput.'), |
| 156 | + '#required' => TRUE, |
| 157 | + ]; |
| 158 | + |
| 159 | + $element['timeout'] = [ |
| 160 | + '#type' => 'number', |
| 161 | + '#title' => $this->t('Timeout in seconds for this process.'), |
| 162 | + '#default_value' => $this->getConfiguration()['timeout'], |
| 163 | + '#description' => $this->t('If the process runs out of time it can still be processed again.'), |
| 164 | + '#size' => 2, |
| 165 | + '#maxlength' => 2, |
| 166 | + '#min' => 1, |
| 167 | + ]; |
| 168 | + $element['weight'] = [ |
| 169 | + '#type' => 'number', |
| 170 | + '#title' => $this->t('Order or execution in the global chain.'), |
| 171 | + '#default_value' => $this->getConfiguration()['weight'], |
| 172 | + ]; |
| 173 | + |
| 174 | + return $element; |
| 175 | + } |
| 176 | + |
| 177 | + |
| 178 | + |
| 179 | + public function onDependencyRemoval(array $dependencies) { |
| 180 | + // Since Processors could be chained we need to check if any other |
| 181 | + // processor instance is using an instance of this one |
| 182 | + return parent::onDependencyRemoval( |
| 183 | + $dependencies |
| 184 | + ); // TODO: Change the autogenerated stub |
| 185 | + } |
| 186 | + |
| 187 | + /** |
| 188 | + * Executes the logic of this plugin given a file path and a context. |
| 189 | + * |
| 190 | + * @param \stdClass $io |
| 191 | + * $io->input needs to contain |
| 192 | + * \Drupal\strawberry_runners\Annotation\StrawberryRunnersPostProcessor::$input_property |
| 193 | + * \Drupal\strawberry_runners\Annotation\StrawberryRunnersPostProcessor::$input_arguments |
| 194 | + * $io->output will contain the result of the processor |
| 195 | + * @param string $context |
| 196 | + */ |
| 197 | + public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPluginInterface::PROCESS) { |
| 198 | + // Specific input key as defined in the annotation |
| 199 | + // In this case it will contain an absolute Path to a File. |
| 200 | + // Needed since this executes locally on the server via SHELL. |
| 201 | + |
| 202 | + $input_property = $this->pluginDefinition['input_property']; |
| 203 | + $input_argument = $this->pluginDefinition['input_argument']; |
| 204 | + $file_uuid = isset($io->input->metadata['dr:uuid']) ? $io->input->metadata['dr:uuid'] : NULL; |
| 205 | + $node_uuid = isset($io->input->nuuid) ? $io->input->nuuid : NULL; |
| 206 | + $config = $this->getConfiguration(); |
| 207 | + $timeout = $config['timeout']; // in seconds |
| 208 | + error_log('run OCR'); |
| 209 | + |
| 210 | + if (isset($io->input->{$input_property}) && $file_uuid && $node_uuid) { |
| 211 | + // To be used by miniOCR as id in the form of {nodeuuid}/canvas/{fileuuid}/p{pagenumber} |
| 212 | + $page_number = isset($io->input->{$input_argument}) ? (int) $io->input->{$input_argument} : 1; |
| 213 | + $pageid = $node_uuid.'/canvas/'.$file_uuid.'/p'.$page_number; |
| 214 | + setlocale(LC_CTYPE, 'en_US.UTF-8'); |
| 215 | + $execstring = $this->buildExecutableCommand($io); |
| 216 | + error_log($execstring); |
| 217 | + if ($execstring) { |
| 218 | + $backup_locale = setlocale(LC_CTYPE, '0'); |
| 219 | + setlocale(LC_CTYPE, $backup_locale); |
| 220 | + // Support UTF-8 commands. |
| 221 | + // @see http://www.php.net/manual/en/function.shell-exec.php#85095 |
| 222 | + shell_exec("LANG=en_US.utf-8"); |
| 223 | + $output = $this->proc_execute($execstring, $timeout); |
| 224 | + if (is_null($output)) { |
| 225 | + throw new \Exception("Could not execute {$execstring} or timed out"); |
| 226 | + } |
| 227 | + |
| 228 | + $miniocr = $this->hOCRtoMiniOCR($output, $pageid); |
| 229 | + error_log($miniocr); |
| 230 | + $io->output = $miniocr; |
| 231 | + } |
| 232 | + } else { |
| 233 | + \throwException(new \InvalidArgumentException); |
| 234 | + } |
| 235 | + } |
| 236 | + |
| 237 | + /** |
| 238 | + * Builds a clean Command string using a File path. |
| 239 | + * |
| 240 | + * @param \stdClass $io |
| 241 | + * $io->input needs to contain |
| 242 | + * \Drupal\strawberry_runners\Annotation\StrawberryRunnersPostProcessor::$input_property |
| 243 | + * \Drupal\strawberry_runners\Annotation\StrawberryRunnersPostProcessor::$input_arguments |
| 244 | + * $io->output will contain the result of the processor |
| 245 | + * |
| 246 | + * @return null|string |
| 247 | + */ |
| 248 | + public function buildExecutableCommand(\stdClass $io) { |
| 249 | + $input_property = $this->pluginDefinition['input_property']; |
| 250 | + $input_argument = $this->pluginDefinition['input_argument']; |
| 251 | + // Sets the default page to 1 if not passed. |
| 252 | + $file_path = isset($io->input->{$input_property}) ? $io->input->{$input_property} : NULL; |
| 253 | + $page_number = isset($io->input->{$input_argument}) ? (int) $io->input->{$input_argument} : 1; |
| 254 | + $config = $this->getConfiguration(); |
| 255 | + $execpath_gs = $config['path']; |
| 256 | + $arguments_gs = $config['arguments']; |
| 257 | + $execpath_tesseract = $config['path_tesseract']; |
| 258 | + $arguments_tesseract = $config['arguments_tesseract']; |
| 259 | + |
| 260 | + if (empty($file_path)) { |
| 261 | + return NULL; |
| 262 | + } |
| 263 | + |
| 264 | + // This run function executes a 2 step function |
| 265 | + //-- with r300 == 300dpi, should be configurable, etc. All should be configurable |
| 266 | + // First gs -dBATCH -dNOPAUSE -sDEVICE=pnggray -r300 -dUseCropBox -sOutputFile=somepage_pagenumber.png %file |
| 267 | + |
| 268 | + $command = ''; |
| 269 | + $can_run_gs = \Drupal::service('strawberryfield.utility')->verifyCommand($execpath_gs); |
| 270 | + $can_run_tesseract = \Drupal::service('strawberryfield.utility')->verifyCommand($execpath_tesseract); |
| 271 | + $filename = pathinfo($file_path, PATHINFO_FILENAME); |
| 272 | + $sourcefolder = pathinfo($file_path,PATHINFO_DIRNAME); |
| 273 | + $sourcefolder = strlen($sourcefolder)> 0 ? $sourcefolder.'/' : sys_get_temp_dir().'/'; |
| 274 | + $gs_destination_filename = "{$sourcefolder}{$filename}_{$page_number}.png"; |
| 275 | + if ($can_run_gs && |
| 276 | + $can_run_tesseract && |
| 277 | + (strpos($arguments_gs, '%file' ) !== FALSE) && |
| 278 | + (strpos($arguments_tesseract, '%file' ) !== FALSE)) { |
| 279 | + $arguments_gs = "-dBATCH -dNOPAUSE -r300 -dUseCropBox -dQUIET -sDEVICE=pnggray -dFirstPage={$page_number} -dLastPage={$page_number} -sOutputFile=$gs_destination_filename " . $arguments_gs; |
| 280 | + $arguments_gs = str_replace('%s','', $arguments_gs); |
| 281 | + $arguments_gs = str_replace_first('%file','%s', $arguments_gs); |
| 282 | + $arguments_gs = sprintf($arguments_gs, $file_path); |
| 283 | + |
| 284 | + $arguments_tesseract = str_replace('%s','', $arguments_tesseract); |
| 285 | + $arguments_tesseract = str_replace_first('%file','%s', $arguments_tesseract); |
| 286 | + $arguments_tesseract = sprintf($arguments_tesseract, $gs_destination_filename); |
| 287 | + |
| 288 | + $command_gs = escapeshellcmd($execpath_gs.' '.$arguments_gs); |
| 289 | + $command_tesseract = escapeshellcmd($execpath_tesseract.' '.$arguments_tesseract); |
| 290 | + |
| 291 | + $command = $command_gs.' && '.$command_tesseract; |
| 292 | + |
| 293 | + } else { |
| 294 | + error_log("missing arguments for OCR"); |
| 295 | + } |
| 296 | + // Only return $command if it contains the original filepath somewhere |
| 297 | + if (strpos($command, $file_path) !== false) { return $command;} |
| 298 | + return ''; |
| 299 | + |
| 300 | + } |
| 301 | + |
| 302 | + protected function hOCRtoMiniOCR($output, $pageid) { |
| 303 | + error_log($output); |
| 304 | + $hocr = simplexml_load_string($output); |
| 305 | + $internalErrors = libxml_use_internal_errors(TRUE); |
| 306 | + libxml_clear_errors(); |
| 307 | + libxml_use_internal_errors($internalErrors); |
| 308 | + if (!$hocr) { |
| 309 | + error_log('Could not convert HOCR to MiniOCR, sources is not valid XML'); |
| 310 | + return NULL; |
| 311 | + } |
| 312 | + $w = new \XMLWriter(); |
| 313 | + $w->openMemory(); |
| 314 | + $w->startDocument('1.0','UTF-8'); |
| 315 | + $w->startElement("ocr"); |
| 316 | + foreach ($hocr->body->children() as $page) { |
| 317 | + $coos = explode(" ", substr($page['title'], 5)); |
| 318 | + if (count($coos)) { |
| 319 | + $w->startElement("p"); |
| 320 | + $w->writeAttribute("id", $pageid); |
| 321 | + $w->writeAttribute("wh", $coos[2] . " " . $coos[3]); |
| 322 | + $w->startElement("b"); |
| 323 | + foreach ($page->children() as $line) { |
| 324 | + $w->startElement("l"); |
| 325 | + foreach ($line->children() as $word) { |
| 326 | + $wcoos = explode(" ", $word['title']); |
| 327 | + if (count($wcoos)) { |
| 328 | + $w->startElement("w"); |
| 329 | + $w->writeAttribute("x", $wcoos[1] . ' ' . $wcoos[2] . ' ' . $wcoos[3] . ' ' . $wcoos[4]); |
| 330 | + error_log($word->__toString()); |
| 331 | + $w->text($word->__toString()); |
| 332 | + $w->endElement(); |
| 333 | + } |
| 334 | + } |
| 335 | + $w->endElement(); |
| 336 | + } |
| 337 | + $w->endElement(); |
| 338 | + $w->endElement(); |
| 339 | + } |
| 340 | + } |
| 341 | + $w->endElement(); |
| 342 | + $w->endDocument(); |
| 343 | + unset($hocr); |
| 344 | + return $w->outputMemory(true); |
| 345 | + } |
| 346 | + |
| 347 | + |
| 348 | + |
| 349 | + |
| 350 | +} |
0 commit comments