diff --git a/README.md b/README.md index 70053ef..f8fcc3b 100644 --- a/README.md +++ b/README.md @@ -426,6 +426,28 @@ return [ TEXTRACT_DISK="uploads" ``` +### Delete the files after processing them with Textract + +#### Using S3 Lifecycle Rules + +You can configure a lifecycle rule on your S3 bucket to delete the files after a certain amount of time; see the AWS +docs for more info: + +https://repost.aws/knowledge-center/s3-empty-bucket-lifecycle-rule + +#### Using the `cleanupFileUsing` hook + +By default, the package will __NOT__ delete the files that have been uploaded to the Textract S3 bucket. If you want to +delete these files, you can implement this using the `TextractUsingS3Upload::cleanupFileUsing(Closure)` hook. + +```php +// Delete the file from the S3 bucket +TextractUsingS3Upload::cleanupFileUsing(function (string $filePath) { + Storage::disk('textract')->delete($filePath); +}); +``` + + **Note** Textract is not available in all regions: diff --git a/composer.json b/composer.json index 4cf88ac..58026a5 100644 --- a/composer.json +++ b/composer.json @@ -34,6 +34,7 @@ }, "require-dev": { "laravel/pint": "^1.0", + "mockery/mockery": "^1.6", "nunomaduro/collision": "^7.8", "orchestra/testbench": "^8.8", "pestphp/pest": "^2.0", diff --git a/src/Text/Loaders/Textract/TextractUsingS3Upload.php b/src/Text/Loaders/Textract/TextractUsingS3Upload.php index 82805ad..679e744 100644 --- a/src/Text/Loaders/Textract/TextractUsingS3Upload.php +++ b/src/Text/Loaders/Textract/TextractUsingS3Upload.php @@ -18,10 +18,17 @@ class TextractUsingS3Upload implements TextLoader { protected static mixed $generateFilePathUsing = null; + protected static mixed $cleanupFileUsing = null; + public function __construct(protected TextractService $textractService) { } + public static function cleanupFileUsing(callable $callback): void + { + static::$cleanupFileUsing = $callback; + } + public static function generateFilePathUsing(callable $callback): void { static::$generateFilePathUsing = 
$callback; @@ -41,6 +48,16 @@ public function getFilePath(): string return $this->defaultFilePathGenerator(); } + /** @noinspection PhpInconsistentReturnPointsInspection */ + public function cleanup(string $path) + { + if (static::$cleanupFileUsing) { + return (static::$cleanupFileUsing)($path); + } + + // No cleanup by default + } + /** * @throws TextractTimedOut * @throws TextractConfigNotFoundException @@ -70,12 +87,14 @@ public function load(mixed $data): ?TextContent throw new TextractStorageException("Could not create the file in the textract s3 bucket with path '{$path}'."); } - return new TextContent( - $this->textractService->s3ObjectToText( - s3Object: new S3Object(bucket: $bucket, name: $path), - timeoutInSeconds: config('extractor.textract_timeout'), - pollingIntervalInSeconds: config('extractor.textract_polling_interval') - ) + $result = $this->textractService->s3ObjectToText( + s3Object: new S3Object(bucket: $bucket, name: $path), + timeoutInSeconds: config('extractor.textract_timeout'), + pollingIntervalInSeconds: config('extractor.textract_polling_interval') ); + + $this->cleanup($path); + + return new TextContent($result); } } diff --git a/tests/Text/TextLoaderTest.php b/tests/Text/TextLoaderTest.php index 703ee1a..ce72190 100644 --- a/tests/Text/TextLoaderTest.php +++ b/tests/Text/TextLoaderTest.php @@ -1,7 +1,10 @@ shouldReceive('s3ObjectToText'); + + $textractUsingS3Upload = new TextractUsingS3Upload($mock); + + $testFilePath = 'extractor/test-file.pdf'; + + Storage::disk('s3')->put($testFilePath, 'Test content'); + + Storage::disk('s3')->assertExists($testFilePath); + + TextractUsingS3Upload::cleanupFileUsing(function ($path) { + Storage::disk('s3')->delete($path); + }); + + $textractUsingS3Upload->cleanup($testFilePath); + + Storage::disk('s3')->assertMissing($testFilePath); +}); + +it('overrides the default file path generation with a custom callback', function () { + Storage::fake('s3'); + + $mock = Mockery::mock(TextractService::class); + 
$mock->shouldReceive('s3ObjectToText'); + + $textractUsingS3Upload = new TextractUsingS3Upload($mock); + + $customFilePath = 'custom-path/custom-file.pdf'; + + TextractUsingS3Upload::generateFilePathUsing(function () use ($customFilePath) { + return $customFilePath; + }); + + expect($textractUsingS3Upload->getFilePath())->toBe($customFilePath); + + Storage::disk('s3')->put($textractUsingS3Upload->getFilePath(), 'Test content'); + + Storage::disk('s3')->assertExists($customFilePath); +});