Skip to content

Commit

Permalink
Process OCR files by file id (#225) (#226)
Browse files Browse the repository at this point in the history
Closing #112

Signed-off-by: Robin Windey <[email protected]>
  • Loading branch information
R0Wi authored Aug 27, 2023
1 parent 14eac7c commit 8a6387f
Show file tree
Hide file tree
Showing 8 changed files with 109 additions and 156 deletions.
111 changes: 24 additions & 87 deletions lib/BackgroundJobs/ProcessFileJob.php
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,6 @@

use \OCP\Files\File;
use OC\User\NoUserException;
use OCA\WorkflowOcr\Exception\OcrNotPossibleException;
use OCA\WorkflowOcr\Exception\OcrProcessorNotFoundException;
use OCA\WorkflowOcr\Helper\IProcessingFileAccessor;
use OCA\WorkflowOcr\Model\WorkflowSettings;
use OCA\WorkflowOcr\Service\IEventService;
Expand Down Expand Up @@ -103,19 +101,14 @@ public function __construct(
*/
protected function run($argument) : void {
$this->logger->debug('STARTED -- Run ' . self::class . ' job. Argument: {argument}.', ['argument' => $argument]);

[$success, $filePath, $uid, $settings] = $this->tryParseArguments($argument);
if (!$success) {
$this->notificationService->createErrorNotification($uid, 'Failed to parse arguments inside the OCR process. Please have a look at your servers logfile for more details.');
return;
}

try {
[$fileId, $uid, $settings] = $this->parseArguments($argument);
$this->initUserEnvironment($uid);
$this->processFile($filePath, $settings, $uid);
$this->processFile($fileId, $settings);
} catch (\Throwable $ex) {
$this->logger->error($ex->getMessage(), ['exception' => $ex]);
$this->notificationService->createErrorNotification($uid, 'An error occured while executing the OCR process. Please have a look at your servers logfile for more details.');
$this->notificationService->createErrorNotification($uid, 'An error occured while executing the OCR process ('.$ex->getMessage().'). Please have a look at your servers logfile for more details.');
} finally {
$this->shutdownUserEnvironment();
}
Expand All @@ -126,80 +119,33 @@ protected function run($argument) : void {
/**
* @param mixed $argument
*/
private function tryParseArguments($argument) : array {
private function parseArguments($argument) : array {
if (!is_array($argument)) {
$this->logger->warning('Argument is no array in ' . self::class . ' method \'tryParseArguments\'.');
return [
false
];
}

$filePath = null;
$uid = null;
$filePathKey = 'filePath';
if (array_key_exists($filePathKey, $argument)) {
$filePath = $argument[$filePathKey];
// '', admin, 'files', 'path/to/file.pdf'
$splitted = explode('/', $filePath, 4);
if (count($splitted) < 4) {
$this->logger->warning('File path "' . $filePath . '" is not valid in ' . self::class . ' method \'tryParseArguments\'.');
return [
false
];
}
$uid = $splitted[1];
} else {
$this->logVariableKeyNotSet($filePathKey, 'tryParseArguments');
throw new \InvalidArgumentException('Argument is no array in ' . self::class . ' method \'tryParseArguments\'.');
}

$settings = null;
$settingsKey = 'settings';
if (array_key_exists($settingsKey, $argument)) {
$jsonSettings = $argument[$settingsKey];
$settings = new WorkflowSettings($jsonSettings);
} else {
$this->logVariableKeyNotSet($settingsKey, 'tryParseArguments');
}
$jsonSettings = $argument['settings'];
$settings = new WorkflowSettings($jsonSettings);
$uid = $argument['uid'];
$fileId = intval($argument['fileId']);

return [
$filePath !== null && $uid !== null && $settings !== null,
$filePath,
$fileId,
$uid,
$settings
];
}

/**
* @param string $filePath The file to be processed
* @param int $fileId The id of the file to be processed
* @param WorkflowSettings $settings The settings to be used for processing
* @param string $userId The user who triggered the processing
*/
private function processFile(string $filePath, WorkflowSettings $settings, string $userId) : void {
$node = $this->getNode($filePath, $userId);

if ($node === null) {
return;
}
private function processFile(int $fileId, WorkflowSettings $settings) : void {
$node = $this->getNode($fileId);

$nodeId = $node->getId();

try {
$ocrFile = $this->ocrService->ocrFile($node, $settings);
} catch(\Throwable $throwable) {
if ($throwable instanceof(OcrNotPossibleException::class)) {
$msg = 'OCR for file ' . $node->getPath() . ' not possible. Message: ' . $throwable->getMessage();
} elseif ($throwable instanceof(OcrProcessorNotFoundException::class)) {
$msg = 'OCR processor not found for mimetype ' . $node->getMimeType();
} else {
throw $throwable;
}

$this->logger->error($msg);
$this->notificationService->createErrorNotification($userId, $msg, $nodeId);

return;
}
$ocrFile = $this->ocrService->ocrFile($node, $settings);

$filePath = $node->getPath();
$fileContent = $ocrFile->getFileContent();
$originalFileExtension = $node->getExtension();
$newFileExtension = $ocrFile->getFileExtension();
Expand All @@ -210,28 +156,23 @@ private function processFile(string $filePath, WorkflowSettings $settings, strin
$filePath :
$filePath . ".pdf";

$this->createNewFileVersion($newFilePath, $fileContent, $nodeId);
$this->createNewFileVersion($newFilePath, $fileContent, $fileId);
}

$this->eventService->textRecognized($ocrFile, $node);
}

private function getNode(string $filePath, string $userId) : ?Node {
try {
/** @var File */
$node = $this->rootFolder->get($filePath);
} catch (NotFoundException $nfEx) {
$msg = 'Could not process file \'' . $filePath . '\'. File was not found';
$this->logger->warning($msg);
$this->notificationService->createErrorNotification($userId, $msg);
return null;
private function getNode(int $fileId) : ?Node {
/** @var File[] */
$nodeArr = $this->rootFolder->getById($fileId);
if (count($nodeArr) === 0) {
throw new NotFoundException('Could not process file with id \'' . $fileId . '\'. File was not found');
}

$node = array_shift($nodeArr);

if (!$node instanceof Node || $node->getType() !== FileInfo::TYPE_FILE) {
$msg = 'Skipping process for \'' . $filePath . '\'. It is not a file';
$this->logger->warning($msg);
$this->notificationService->createErrorNotification($userId, $msg);
return null;
throw new \InvalidArgumentException('Skipping process for file with id \'' . $fileId . '\'. It is not a file');
}

return $node;
Expand Down Expand Up @@ -277,8 +218,4 @@ private function createNewFileVersion(string $filePath, string $ocrContent, int
$this->processingFileAccessor->setCurrentlyProcessedFileId(null);
}
}

private function logVariableKeyNotSet(string $key, string $method) : void {
$this->logger->warning("Variable '" . $key . "' not set in " . self::class . " method '" . $method . "'.");
}
}
3 changes: 3 additions & 0 deletions lib/Exception/OcrProcessorNotFoundException.php
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,7 @@
use Exception;

/**
 * Signals that no OCR processor was found for a given mime type.
 *
 * The exception message is built from the mime type so that callers which
 * only forward `getMessage()` still report which mime type was affected.
 */
class OcrProcessorNotFoundException extends Exception {
	/**
	 * @param string $mimeType The mime type for which no OCR processor was found
	 * @param \Throwable|null $previous Optional causing exception, kept for exception chaining
	 */
	public function __construct(string $mimeType, ?\Throwable $previous = null) {
		// Go through the parent constructor instead of writing the protected
		// $message property directly, so message, code and previous are all
		// initialised via the documented Exception API.
		parent::__construct('OCR processor for mime type ' . $mimeType . ' not found', 0, $previous);
	}
}
4 changes: 2 additions & 2 deletions lib/OcrProcessors/OcrMyPdfBasedProcessor.php
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ public function ocrFile(File $file, WorkflowSettings $settings, GlobalSettings $
$exitCode = $this->command->getExitCode();

if (!$success) {
throw new OcrNotPossibleException('OCRmyPDF exited abnormally with exit-code ' . $exitCode . '. Message: ' . $errorOutput . ' ' . $stdErr);
throw new OcrNotPossibleException('OCRmyPDF exited abnormally with exit-code ' . $exitCode . ' for file ' . $file->getPath() . '. Message: ' . $errorOutput . ' ' . $stdErr);
}

if ($stdErr !== '' || $errorOutput !== '') {
Expand All @@ -86,7 +86,7 @@ public function ocrFile(File $file, WorkflowSettings $settings, GlobalSettings $
$ocrFileContent = $this->command->getOutput();

if (!$ocrFileContent) {
throw new OcrNotPossibleException('OCRmyPDF did not produce any output');
throw new OcrNotPossibleException('OCRmyPDF did not produce any output for file ' . $file->getPath());
}

$recognizedText = $this->sidecarFileAccessor->getSidecarFileContent();
Expand Down
13 changes: 7 additions & 6 deletions lib/Operation.php
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ public function onEvent(string $eventName, Event $event, IRuleMatcher $ruleMatch
if (!($match = $this->getMatch($ruleMatcher)) ||
!$this->tryGetFile($eventName, $event, $node) ||
$this->eventTriggeredByOcrProcess($node) ||
!$this->tryGetJobArgs($node, $match, $argsArray)) {
!$this->tryGetJobArgs($node, $match['operation'], $argsArray)) {
return;
}

Expand Down Expand Up @@ -192,23 +192,24 @@ private function getMatch(IRuleMatcher $ruleMatcher) : array {

/**
* @param Node $node
* @param array $match
* @param string $operation
* @param array $argsArray
*/
private function tryGetJobArgs(Node $node, $match, & $argsArray) : bool {
private function tryGetJobArgs(Node $node, $operation, & $argsArray) : bool {
// Check path has valid structure
$filePath = $node->getPath();
// '', admin, 'files', 'path/to/file.pdf'
[,, $folder,] = explode('/', $filePath, 4);
[, $user, $folder,] = explode('/', $filePath, 4);
if ($folder !== 'files') {
$this->logger->debug('Not processing event because path \'{path}\' seems to be invalid.',
['path' => $filePath]);
return false;
}

$argsArray = [
'filePath' => $filePath,
'settings' => $match['operation']
'uid' => $user,
'fileId' => $node->getId(),
'settings' => $operation
];

return true;
Expand Down
10 changes: 5 additions & 5 deletions tests/Integration/Notification/NotificationTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -151,17 +151,17 @@ protected function setUp() : void {

$this->processFileJob->setId(111);
$this->processFileJob->setArgument([
'filePath' => '/admin/files/somefile.pdf',
'fileId' => 42,
'uid' => 'someuser',
'settings' => '{}'
]);
}

public function testBackgroundJobCreatesErrorNotificationIfOcrFailed() {
$fileMock = $this->createValidFileMock();
$this->rootFolder->method('get')
->with('/admin/files/somefile.pdf')
->willReturn($fileMock);
$this->rootFolder->method('getById')
->with(42)
->willReturn([$fileMock]);

$this->ocrService->expects($this->once())
->method('ocrFile')
Expand All @@ -177,7 +177,7 @@ public function testBackgroundJobCreatesErrorNotificationIfOcrFailed() {
$notification = $notifications[0];
$this->assertEquals('workflow_ocr', $notification->getApp());
$this->assertEquals('ocr_error', $notification->getSubject());
$this->assertEquals('OCR for file /admin/files/somefile.pdf not possible. Message: Some error', $notification->getSubjectParameters()['message']);
$this->assertEquals('An error occured while executing the OCR process (Some error). Please have a look at your servers logfile for more details.', $notification->getSubjectParameters()['message']);
}

/**
Expand Down
Loading

0 comments on commit 8a6387f

Please sign in to comment.