Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Backport stable27] Process OCR files by file id #226

Merged
merged 1 commit into from
Aug 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
111 changes: 24 additions & 87 deletions lib/BackgroundJobs/ProcessFileJob.php
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,6 @@

use \OCP\Files\File;
use OC\User\NoUserException;
use OCA\WorkflowOcr\Exception\OcrNotPossibleException;
use OCA\WorkflowOcr\Exception\OcrProcessorNotFoundException;
use OCA\WorkflowOcr\Helper\IProcessingFileAccessor;
use OCA\WorkflowOcr\Model\WorkflowSettings;
use OCA\WorkflowOcr\Service\IEventService;
Expand Down Expand Up @@ -103,19 +101,14 @@ public function __construct(
*/
protected function run($argument) : void {
$this->logger->debug('STARTED -- Run ' . self::class . ' job. Argument: {argument}.', ['argument' => $argument]);

[$success, $filePath, $uid, $settings] = $this->tryParseArguments($argument);
if (!$success) {
$this->notificationService->createErrorNotification($uid, 'Failed to parse arguments inside the OCR process. Please have a look at your servers logfile for more details.');
return;
}

try {
[$fileId, $uid, $settings] = $this->parseArguments($argument);
$this->initUserEnvironment($uid);
$this->processFile($filePath, $settings, $uid);
$this->processFile($fileId, $settings);
} catch (\Throwable $ex) {
$this->logger->error($ex->getMessage(), ['exception' => $ex]);
$this->notificationService->createErrorNotification($uid, 'An error occured while executing the OCR process. Please have a look at your servers logfile for more details.');
$this->notificationService->createErrorNotification($uid, 'An error occured while executing the OCR process ('.$ex->getMessage().'). Please have a look at your servers logfile for more details.');
} finally {
$this->shutdownUserEnvironment();
}
Expand All @@ -126,80 +119,33 @@ protected function run($argument) : void {
/**
* @param mixed $argument
*/
private function tryParseArguments($argument) : array {
private function parseArguments($argument) : array {
if (!is_array($argument)) {
$this->logger->warning('Argument is no array in ' . self::class . ' method \'tryParseArguments\'.');
return [
false
];
}

$filePath = null;
$uid = null;
$filePathKey = 'filePath';
if (array_key_exists($filePathKey, $argument)) {
$filePath = $argument[$filePathKey];
// '', admin, 'files', 'path/to/file.pdf'
$splitted = explode('/', $filePath, 4);
if (count($splitted) < 4) {
$this->logger->warning('File path "' . $filePath . '" is not valid in ' . self::class . ' method \'tryParseArguments\'.');
return [
false
];
}
$uid = $splitted[1];
} else {
$this->logVariableKeyNotSet($filePathKey, 'tryParseArguments');
throw new \InvalidArgumentException('Argument is no array in ' . self::class . ' method \'tryParseArguments\'.');
}

$settings = null;
$settingsKey = 'settings';
if (array_key_exists($settingsKey, $argument)) {
$jsonSettings = $argument[$settingsKey];
$settings = new WorkflowSettings($jsonSettings);
} else {
$this->logVariableKeyNotSet($settingsKey, 'tryParseArguments');
}
$jsonSettings = $argument['settings'];
$settings = new WorkflowSettings($jsonSettings);
$uid = $argument['uid'];
$fileId = intval($argument['fileId']);

return [
$filePath !== null && $uid !== null && $settings !== null,
$filePath,
$fileId,
$uid,
$settings
];
}

/**
* @param string $filePath The file to be processed
* @param int $fileId The id of the file to be processed
* @param WorkflowSettings $settings The settings to be used for processing
* @param string $userId The user who triggered the processing
*/
private function processFile(string $filePath, WorkflowSettings $settings, string $userId) : void {
$node = $this->getNode($filePath, $userId);

if ($node === null) {
return;
}
private function processFile(int $fileId, WorkflowSettings $settings) : void {
$node = $this->getNode($fileId);

$nodeId = $node->getId();

try {
$ocrFile = $this->ocrService->ocrFile($node, $settings);
} catch(\Throwable $throwable) {
if ($throwable instanceof(OcrNotPossibleException::class)) {
$msg = 'OCR for file ' . $node->getPath() . ' not possible. Message: ' . $throwable->getMessage();
} elseif ($throwable instanceof(OcrProcessorNotFoundException::class)) {
$msg = 'OCR processor not found for mimetype ' . $node->getMimeType();
} else {
throw $throwable;
}

$this->logger->error($msg);
$this->notificationService->createErrorNotification($userId, $msg, $nodeId);

return;
}
$ocrFile = $this->ocrService->ocrFile($node, $settings);

$filePath = $node->getPath();
$fileContent = $ocrFile->getFileContent();
$originalFileExtension = $node->getExtension();
$newFileExtension = $ocrFile->getFileExtension();
Expand All @@ -210,28 +156,23 @@ private function processFile(string $filePath, WorkflowSettings $settings, strin
$filePath :
$filePath . ".pdf";

$this->createNewFileVersion($newFilePath, $fileContent, $nodeId);
$this->createNewFileVersion($newFilePath, $fileContent, $fileId);
}

$this->eventService->textRecognized($ocrFile, $node);
}

private function getNode(string $filePath, string $userId) : ?Node {
try {
/** @var File */
$node = $this->rootFolder->get($filePath);
} catch (NotFoundException $nfEx) {
$msg = 'Could not process file \'' . $filePath . '\'. File was not found';
$this->logger->warning($msg);
$this->notificationService->createErrorNotification($userId, $msg);
return null;
private function getNode(int $fileId) : ?Node {
/** @var File[] */
$nodeArr = $this->rootFolder->getById($fileId);
if (count($nodeArr) === 0) {
throw new NotFoundException('Could not process file with id \'' . $fileId . '\'. File was not found');
}

$node = array_shift($nodeArr);

if (!$node instanceof Node || $node->getType() !== FileInfo::TYPE_FILE) {
$msg = 'Skipping process for \'' . $filePath . '\'. It is not a file';
$this->logger->warning($msg);
$this->notificationService->createErrorNotification($userId, $msg);
return null;
throw new \InvalidArgumentException('Skipping process for file with id \'' . $fileId . '\'. It is not a file');
}

return $node;
Expand Down Expand Up @@ -277,8 +218,4 @@ private function createNewFileVersion(string $filePath, string $ocrContent, int
$this->processingFileAccessor->setCurrentlyProcessedFileId(null);
}
}

private function logVariableKeyNotSet(string $key, string $method) : void {
$this->logger->warning("Variable '" . $key . "' not set in " . self::class . " method '" . $method . "'.");
}
}
3 changes: 3 additions & 0 deletions lib/Exception/OcrProcessorNotFoundException.php
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,7 @@
use Exception;

/**
 * Thrown when no OCR processor is registered for the mime type of a file.
 *
 * The exception message is built from the given mime type, so callers can
 * surface `getMessage()` directly (for example in logs or notifications).
 */
class OcrProcessorNotFoundException extends Exception {
	/**
	 * @param string $mimeType The mime type for which no OCR processor could be found
	 * @param \Throwable|null $previous Optional underlying cause, exposed via getPrevious()
	 */
	public function __construct(string $mimeType, ?\Throwable $previous = null) {
		// Go through the parent constructor instead of assigning the protected
		// property directly, so message, code and previous are initialised the
		// standard way.
		parent::__construct('OCR processor for mime type ' . $mimeType . ' not found', 0, $previous);
	}
}
4 changes: 2 additions & 2 deletions lib/OcrProcessors/OcrMyPdfBasedProcessor.php
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ public function ocrFile(File $file, WorkflowSettings $settings, GlobalSettings $
$exitCode = $this->command->getExitCode();

if (!$success) {
throw new OcrNotPossibleException('OCRmyPDF exited abnormally with exit-code ' . $exitCode . '. Message: ' . $errorOutput . ' ' . $stdErr);
throw new OcrNotPossibleException('OCRmyPDF exited abnormally with exit-code ' . $exitCode . ' for file ' . $file->getPath() . '. Message: ' . $errorOutput . ' ' . $stdErr);
}

if ($stdErr !== '' || $errorOutput !== '') {
Expand All @@ -86,7 +86,7 @@ public function ocrFile(File $file, WorkflowSettings $settings, GlobalSettings $
$ocrFileContent = $this->command->getOutput();

if (!$ocrFileContent) {
throw new OcrNotPossibleException('OCRmyPDF did not produce any output');
throw new OcrNotPossibleException('OCRmyPDF did not produce any output for file ' . $file->getPath());
}

$recognizedText = $this->sidecarFileAccessor->getSidecarFileContent();
Expand Down
13 changes: 7 additions & 6 deletions lib/Operation.php
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ public function onEvent(string $eventName, Event $event, IRuleMatcher $ruleMatch
if (!($match = $this->getMatch($ruleMatcher)) ||
!$this->tryGetFile($eventName, $event, $node) ||
$this->eventTriggeredByOcrProcess($node) ||
!$this->tryGetJobArgs($node, $match, $argsArray)) {
!$this->tryGetJobArgs($node, $match['operation'], $argsArray)) {
return;
}

Expand Down Expand Up @@ -192,23 +192,24 @@ private function getMatch(IRuleMatcher $ruleMatcher) : array {

/**
* @param Node $node
* @param array $match
* @param string $operation
* @param array $argsArray
*/
private function tryGetJobArgs(Node $node, $match, & $argsArray) : bool {
private function tryGetJobArgs(Node $node, $operation, & $argsArray) : bool {
// Check path has valid structure
$filePath = $node->getPath();
// '', admin, 'files', 'path/to/file.pdf'
[,, $folder,] = explode('/', $filePath, 4);
[, $user, $folder,] = explode('/', $filePath, 4);
if ($folder !== 'files') {
$this->logger->debug('Not processing event because path \'{path}\' seems to be invalid.',
['path' => $filePath]);
return false;
}

$argsArray = [
'filePath' => $filePath,
'settings' => $match['operation']
'uid' => $user,
'fileId' => $node->getId(),
'settings' => $operation
];

return true;
Expand Down
10 changes: 5 additions & 5 deletions tests/Integration/Notification/NotificationTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -151,17 +151,17 @@ protected function setUp() : void {

$this->processFileJob->setId(111);
$this->processFileJob->setArgument([
'filePath' => '/admin/files/somefile.pdf',
'fileId' => 42,
'uid' => 'someuser',
'settings' => '{}'
]);
}

public function testBackgroundJobCreatesErrorNotificationIfOcrFailed() {
$fileMock = $this->createValidFileMock();
$this->rootFolder->method('get')
->with('/admin/files/somefile.pdf')
->willReturn($fileMock);
$this->rootFolder->method('getById')
->with(42)
->willReturn([$fileMock]);

$this->ocrService->expects($this->once())
->method('ocrFile')
Expand All @@ -177,7 +177,7 @@ public function testBackgroundJobCreatesErrorNotificationIfOcrFailed() {
$notification = $notifications[0];
$this->assertEquals('workflow_ocr', $notification->getApp());
$this->assertEquals('ocr_error', $notification->getSubject());
$this->assertEquals('OCR for file /admin/files/somefile.pdf not possible. Message: Some error', $notification->getSubjectParameters()['message']);
$this->assertEquals('An error occured while executing the OCR process (Some error). Please have a look at your servers logfile for more details.', $notification->getSubjectParameters()['message']);
}

/**
Expand Down
Loading
Loading