Skip to content

Commit

Permalink
Merge pull request #5 from HelgeSverre/feature/cleanup-textract-s3
Browse files Browse the repository at this point in the history
Add hook to enable deletion of files after processing with Textract
  • Loading branch information
HelgeSverre authored Jan 4, 2024
2 parents 932f097 + 911f66f commit 9bcf76f
Show file tree
Hide file tree
Showing 4 changed files with 95 additions and 6 deletions.
22 changes: 22 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -426,6 +426,28 @@ return [
TEXTRACT_DISK="uploads"
```

### Delete the files after processing them with Textract

#### Using S3 Lifecycle Rules

You can configure a lifecycle rule on your S3 bucket to delete the files after a certain amount of time. See the AWS
docs for more info:

https://repost.aws/knowledge-center/s3-empty-bucket-lifecycle-rule

#### Using the `cleanupFileUsing` hook

By default, the package will __NOT__ delete the files that have been uploaded to the Textract S3 bucket. If you want to
delete these files, you can implement this using the `TextractUsingS3Upload::cleanupFileUsing(Closure)` hook.

```php
// Delete the file from the S3 bucket
TextractUsingS3Upload::cleanupFileUsing(function (string $filePath) {
    Storage::disk('textract')->delete($filePath);
});
```


**Note**

Textract is not available in all regions:
Expand Down
1 change: 1 addition & 0 deletions composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
},
"require-dev": {
"laravel/pint": "^1.0",
"mockery/mockery": "^1.6",
"nunomaduro/collision": "^7.8",
"orchestra/testbench": "^8.8",
"pestphp/pest": "^2.0",
Expand Down
31 changes: 25 additions & 6 deletions src/Text/Loaders/Textract/TextractUsingS3Upload.php
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,17 @@ class TextractUsingS3Upload implements TextLoader
{
protected static mixed $generateFilePathUsing = null;

protected static mixed $cleanupFileUsing = null;

/**
 * Create the loader.
 *
 * @param  TextractService  $textractService  Service used by load() to turn the uploaded S3 object into text.
 */
public function __construct(
    protected TextractService $textractService,
) {
}

/**
 * Register the hook that cleanup() invokes with the path of a processed file.
 *
 * The callback is stored statically, so it applies to every instance of the
 * loader. When no callback has been registered, cleanup() does nothing.
 *
 * @param  callable  $callback  Receives the file path (string) as its only argument.
 */
public static function cleanupFileUsing(callable $callback): void
{
    static::$cleanupFileUsing = $callback;
}

public static function generateFilePathUsing(callable $callback): void
{
static::$generateFilePathUsing = $callback;
Expand All @@ -41,6 +48,16 @@ public function getFilePath(): string
return $this->defaultFilePathGenerator();
}

/**
 * Run the registered cleanup hook for the given file path, if one exists.
 *
 * @param  string  $path  Path of the file that was handed to Textract.
 * @return mixed Whatever the registered callback returns, or null when no
 *               callback has been registered (no cleanup happens by default).
 */
public function cleanup(string $path)
{
    $callback = static::$cleanupFileUsing;

    // No hook registered: leave the file where it is.
    if (! $callback) {
        return null;
    }

    return $callback($path);
}

/**
* @throws TextractTimedOut
* @throws TextractConfigNotFoundException
Expand Down Expand Up @@ -70,12 +87,14 @@ public function load(mixed $data): ?TextContent
throw new TextractStorageException("Could not create the file in the textract s3 bucket with path '{$path}'.");
}

return new TextContent(
$this->textractService->s3ObjectToText(
s3Object: new S3Object(bucket: $bucket, name: $path),
timeoutInSeconds: config('extractor.textract_timeout'),
pollingIntervalInSeconds: config('extractor.textract_polling_interval')
)
$result = $this->textractService->s3ObjectToText(
s3Object: new S3Object(bucket: $bucket, name: $path),
timeoutInSeconds: config('extractor.textract_timeout'),
pollingIntervalInSeconds: config('extractor.textract_polling_interval')
);

$this->cleanup($path);

return new TextContent($result);
}
}
47 changes: 47 additions & 0 deletions tests/Text/TextLoaderTest.php
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
<?php

use HelgeSverre\Extractor\Facades\Text;
use HelgeSverre\Extractor\Text\Loaders\Textract\TextractService;
use HelgeSverre\Extractor\Text\Loaders\Textract\TextractUsingS3Upload;
use HelgeSverre\Extractor\Text\TextContent;
use Illuminate\Support\Facades\Storage;

it('Can load Text', function () {
$text = Text::text(file_get_contents(__DIR__.'/../samples/wolt-pizza-norwegian.txt'));
Expand Down Expand Up @@ -94,3 +97,47 @@
'Termination of the Agreement',
);
});

it('removes the file from S3 using the provided cleanup callback', function () {
    // Replace the "s3" disk with a fake so the test never touches a real bucket.
    Storage::fake('s3');
    $disk = Storage::disk('s3');

    // Seed the fake disk with the file the hook is expected to remove.
    $uploadedPath = 'extractor/test-file.pdf';
    $disk->put($uploadedPath, 'Test content');
    $disk->assertExists($uploadedPath);

    // The service is not exercised here; the mock only satisfies the constructor.
    $textractService = Mockery::mock(TextractService::class);
    $textractService->shouldReceive('s3ObjectToText');
    $loader = new TextractUsingS3Upload($textractService);

    TextractUsingS3Upload::cleanupFileUsing(static function ($path) {
        Storage::disk('s3')->delete($path);
    });

    // Calling cleanup() directly must delegate to the registered hook.
    $loader->cleanup($uploadedPath);

    $disk->assertMissing($uploadedPath);
});

it('overrides the default file path generation with a custom callback', function () {
    // Replace the "s3" disk with a fake so the test never touches a real bucket.
    Storage::fake('s3');
    $disk = Storage::disk('s3');

    // The service is not exercised here; the mock only satisfies the constructor.
    $textractService = Mockery::mock(TextractService::class);
    $textractService->shouldReceive('s3ObjectToText');
    $loader = new TextractUsingS3Upload($textractService);

    $expectedPath = 'custom-path/custom-file.pdf';

    // Register a generator that always yields the fixed path.
    TextractUsingS3Upload::generateFilePathUsing(fn () => $expectedPath);

    expect($loader->getFilePath())->toBe($expectedPath);

    // Writing to the generated path must land on the custom location.
    $disk->put($loader->getFilePath(), 'Test content');

    $disk->assertExists($expectedPath);
});

0 comments on commit 9bcf76f

Please sign in to comment.