Skip to content

Commit aa59caf

Browse files
committed
feature: file enum anonymizer, inject samples in database from a plain text or csv file
missing testing files
1 parent b0b3bb8 commit aa59caf

File tree

14 files changed

+533
-0
lines changed

14 files changed

+533
-0
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
## Next
44

5+
* [feature] 🌟 File enum anonymizer, inject samples in database from a plain text or CSV file.
56
* [feature] 🌟 String pattern anonymizer, build complex strings by fetching values from other anonymizers.
67

78
## 2.0.3

docs/content/anonymization/core-anonymizers.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ This page list all *Anonymizers* provided by *DbToolsBundle*.
1414
<!--@include: ./core-anonymizers/md5.md-->
1515
<!--@include: ./core-anonymizers/string.md-->
1616
<!--@include: ./core-anonymizers/pattern.md-->
17+
<!--@include: ./core-anonymizers/file-enum.md-->
1718
<!--@include: ./core-anonymizers/lastname.md-->
1819
<!--@include: ./core-anonymizers/firstname.md-->
1920
<!--@include: ./core-anonymizers/lorem-ipsum.md-->
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
## File enum
2+
3+
This *Anonymizer* will fill configured column with a random value from a given sample fetched
4+
from a plain text or a CSV file.
5+
6+
Given the following file:
7+
8+
```txt
9+
none
10+
bad
11+
good
12+
expert
13+
```
14+
15+
Then:
16+
17+
@@@ standalone docker
18+
19+
```yaml [YAML]
20+
# db_tools.config.yaml
21+
anonymization:
22+
default:
23+
customer:
24+
level:
25+
anonymizer: file_enum
26+
options: {source: ./resources/levels.txt}
27+
#...
28+
```
29+
30+
@@@
31+
@@@ symfony
32+
33+
::: code-group
34+
```php [Attribute]
35+
namespace App\Entity;
36+
37+
use Doctrine\ORM\Mapping as ORM;
38+
use MakinaCorpus\DbToolsBundle\Attribute\Anonymize;
39+
40+
#[ORM\Entity()]
41+
#[ORM\Table(name: 'customer')]
42+
class Customer
43+
{
44+
// ...
45+
46+
#[ORM\Column(length: 255)]
47+
#[Anonymize(type: 'file_enum', options: ['source' => "./resources/levels.txt"])] // [!code ++]
48+
private ?string $level = null;
49+
50+
// ...
51+
}
52+
```
53+
54+
```yaml [YAML]
55+
# config/anonymization.yaml
56+
customer:
57+
level:
58+
anonymizer: file_enum
59+
options: {source: ./resources/levels.txt}
60+
#...
61+
```
62+
:::
63+
64+
@@@
65+
66+
File will be read this way:
67+
- When using a plain text file, each line is a value, no matter what's inside.
68+
- When using a CSV file, the first column will be used instead.
69+
70+
When parsing a file, you can set the following options as well:
71+
- `file_csv_enclosure`: if file is a CSV, use this as the enclosure character (default is `'"'`).
72+
- `file_csv_escape`: if file is a CSV, use this as the escape character (default is `'\\'`).
73+
- `file_csv_separator`: if file is a CSV, use this as the separator character (default is `','`).
74+
- `file_skip_header`: when reading any file, set this to true to skip the first line (default is `false`).
75+
76+
:::warning
77+
The filename can be absolute, or relative. When relative, it will be relative
78+
to the current PHP working directory.
79+
80+
Working with the PHP working directory is experimental.
81+
This might cause trouble depending upon your execution environment.
82+
83+
Future versions will allow a better directory selection.
84+
:::

src/Anonymization/Anonymizer/AnonymizerRegistry.php

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ class AnonymizerRegistry
1717
Core\ConstantAnonymizer::class,
1818
Core\DateAnonymizer::class,
1919
Core\EmailAnonymizer::class,
20+
Core\FileEnumAnonymizer::class,
2021
Core\FirstNameAnonymizer::class,
2122
Core\FloatAnonymizer::class,
2223
Core\IbanBicAnonymizer::class,
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace MakinaCorpus\DbToolsBundle\Anonymization\Anonymizer\Core;
6+
7+
use MakinaCorpus\DbToolsBundle\Anonymization\Anonymizer\AbstractEnumAnonymizer;
8+
use MakinaCorpus\DbToolsBundle\Attribute\AsAnonymizer;
9+
use MakinaCorpus\DbToolsBundle\Helper\FileReader;
10+
11+
#[AsAnonymizer(
    name: 'file_enum',
    pack: 'core',
    description: <<<TXT
    Anonymize any text value using a random element from the given file.
    Options are:
      - 'source': filename to load, filename must be absolute, or relative
        to the current PHP working directory.
      - 'file_csv_enclosure': if file is a CSV, use this as the enclosure
        character (default is '"').
      - 'file_csv_escape': if file is a CSV, use this as the escape
        character (default is '\\').
      - 'file_csv_separator': if file is a CSV, use this as the separator
        character (default is ',').
      - 'file_skip_header': when reading any file, set this to true to skip
        the first line (default is false).
    TXT
)]
/**
 * Enum anonymizer whose sample values are loaded from a plain text or CSV
 * file instead of being listed inline in the configuration.
 *
 * The 'source' option is handed to FileReader as-is: no path resolution is
 * done here, hence a relative filename is resolved by PHP itself against the
 * current working directory.
 */
class FileEnumAnonymizer extends AbstractEnumAnonymizer
{
    /**
     * Validate options, and fail early when the 'source' file is missing,
     * is not a regular file or is not readable.
     */
    #[\Override]
    protected function validateOptions(): void
    {
        parent::validateOptions();

        FileReader::ensureFile($this->getSourceFilename());
    }

    /**
     * Load all sample values from the 'source' file.
     *
     * @return array<string>
     */
    #[\Override]
    protected function getSample(): array
    {
        $values = FileReader::readEnumFile($this->getSourceFilename(), $this->options);

        // Keys are dropped on purpose: the reader is generator based, and
        // generator keys must never be allowed to overwrite sample entries.
        return \iterator_to_array($values, false);
    }

    /**
     * Read the mandatory 'source' option.
     */
    private function getSourceFilename(): string
    {
        return $this->options->getString('source', null, true);
    }
}

src/Helper/FileReader.php

Lines changed: 185 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,185 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace MakinaCorpus\DbToolsBundle\Helper;
6+
7+
use MakinaCorpus\DbToolsBundle\Error\ConfigurationException;
8+
use MakinaCorpus\DbToolsBundle\Anonymization\Anonymizer\Options;
9+
10+
/**
 * Static helpers for reading sample data files (plain text, CSV, TSV).
 *
 * All readers are generators: the file is opened on first iteration and
 * closed when iteration ends or the generator is destroyed.
 */
class FileReader
{
    /**
     * Get the extension of a filename, without the leading dot.
     *
     * Only the last path segment is inspected, so dots in directory names
     * (for example "../data/levels" or "/data.d/levels") are never mistaken
     * for an extension. A name that only starts with a dot (".hidden") has
     * no extension.
     *
     * @return null|string
     *   Null when the file has no extension, the extension otherwise, with
     *   its original letter case. May be an empty string when the name ends
     *   with a dot.
     */
    public static function getFileExtension(string $filename): ?string
    {
        $basename = \basename($filename);
        $pos = \strrpos($basename, '.');

        if (false === $pos || 0 === $pos) {
            return null;
        }

        return \substr($basename, $pos + 1);
    }

    /**
     * Iterator on file contents.
     *
     * Files without extension and ".txt" files yield one value per line,
     * ".csv" and ".tsv" files yield the first column of each row. The
     * extension is matched case insensitively.
     *
     * @return iterable<string>
     *
     * @throws ConfigurationException
     *   When the file type is not supported or the file cannot be read.
     */
    public static function readEnumFile(string $filename, ?Options $options = null, ?string $anonymizerId = null): iterable
    {
        $ext = self::getFileExtension($filename);

        // Plain if/else rather than match(): the CSV branch needs a loop and
        // cannot be written as a single expression.
        if (null === $ext || 'txt' === \strtolower($ext)) {
            yield from self::readTxtFile($filename, $options, $anonymizerId);
        } elseif (self::isCsvExtension($ext)) {
            foreach (self::readCsvFile($filename, $options, $anonymizerId) as $line) {
                \assert(\is_array($line));
                if ($line) {
                    yield $line[0];
                }
            }
        } else {
            throw self::createError(
                $anonymizerId,
                "Anonymizer '%s': unsupported enum data file type: '%s'.",
                "Unsupported enum data file type: '%s'.",
                $ext,
            );
        }
    }

    /**
     * Iterator on column file contents.
     *
     * Only ".csv" and ".tsv" files are supported, the extension is matched
     * case insensitively.
     *
     * @return iterable<array<string>>
     *
     * @throws ConfigurationException
     *   When the file type is not supported or the file cannot be read.
     */
    public static function readColumnFile(string $filename, ?Options $options = null, ?string $anonymizerId = null): iterable
    {
        if (self::isCsvExtension(self::getFileExtension($filename))) {
            yield from self::readCsvFile($filename, $options, $anonymizerId);
        } else {
            throw new ConfigurationException("Unsupported column data file type.");
        }
    }

    /**
     * Iterator on plain text file lines.
     *
     * Each line is trimmed, lines that are empty once trimmed are ignored.
     * When the 'file_skip_header' option is true, the first line of the file
     * is skipped, whatever it contains.
     *
     * @return iterable<string>
     *
     * @throws ConfigurationException
     *   When the file does not exist, is not a regular file or cannot be read.
     */
    public static function readTxtFile(string $filename, ?Options $options = null, ?string $anonymizerId = null): iterable
    {
        $options ??= new Options();
        $handle = self::openFile($filename, $anonymizerId);

        try {
            $skipHeader = $options->getBool('file_skip_header', false);
            $first = true;

            // Compare against false explicitly: a last line containing only
            // "0" is a falsy string and would otherwise end the loop.
            while (false !== ($line = \fgets($handle))) {
                $isHeader = $first && $skipHeader;
                $first = false;

                if ($isHeader) {
                    continue; // Skip header.
                }

                $line = \trim($line); // Trim whitespaces (including end of line).

                // Strict comparison, empty() would also discard the "0" value.
                if ('' === $line) {
                    continue; // Empty line, ignore.
                }

                yield $line;
            }
        } finally {
            \fclose($handle);
        }
    }

    /**
     * Iterator on CSV file contents.
     *
     * Separator, enclosure and escape characters are read from the
     * 'file_csv_separator', 'file_csv_enclosure' and 'file_csv_escape'
     * options. The default separator is a comma whatever the file extension
     * is: for a tab separated file, 'file_csv_separator' must be set
     * explicitly. Rows in which every field is empty are ignored. When the
     * 'file_skip_header' option is true, the first row is skipped.
     *
     * @return iterable<array<string>>
     *
     * @throws ConfigurationException
     *   When the file does not exist, is not a regular file or cannot be read.
     */
    public static function readCsvFile(string $filename, ?Options $options = null, ?string $anonymizerId = null): iterable
    {
        $options ??= new Options();
        $handle = self::openFile($filename, $anonymizerId);

        try {
            $separator = $options->getString('file_csv_separator', ',');
            $enclosure = $options->getString('file_csv_enclosure', '"');
            $escape = $options->getString('file_csv_escape', '\\');
            $skipHeader = $options->getBool('file_skip_header', false);

            $first = true;
            while (\is_array($row = \fgetcsv($handle, null, $separator, $enclosure, $escape))) {
                $isHeader = $first && $skipHeader;
                $first = false;

                if ($isHeader) {
                    continue; // Skip header.
                }

                // A field is empty only when null (blank line) or an empty
                // string; a bare array_filter() would also drop "0" values.
                $filled = \array_filter(
                    $row,
                    static fn (mixed $value): bool => null !== $value && '' !== $value,
                );

                if (!$filled) {
                    continue; // Empty line, ignore.
                }

                yield $row;
            }
        } finally {
            \fclose($handle);
        }
    }

    /**
     * Ensure the given filename exists, is a regular file and is readable.
     *
     * @throws ConfigurationException
     *   When any of those conditions is not met.
     */
    public static function ensureFile(string $filename, ?string $anonymizerId = null): void
    {
        if (!\file_exists($filename)) {
            throw self::createError(
                $anonymizerId,
                "Anonymizer '%s' uses a non existing file: %s",
                "Uses a non existing file: %s",
                $filename,
            );
        }
        if (!\is_file($filename)) {
            throw self::createError(
                $anonymizerId,
                "Anonymizer '%s' is not a regular file: %s",
                "Is not a regular file: %s",
                $filename,
            );
        }
        if (!\is_readable($filename)) {
            throw self::createError(
                $anonymizerId,
                "Anonymizer '%s' file cannot be read: %s",
                "File cannot be read: %s",
                $filename,
            );
        }
    }

    /**
     * Is the given extension one of the CSV-like supported ones.
     */
    private static function isCsvExtension(?string $ext): bool
    {
        return null !== $ext && \in_array(\strtolower($ext), ['csv', 'tsv'], true);
    }

    /**
     * Validate the file then open it for reading.
     *
     * @return resource
     *
     * @throws ConfigurationException
     *   When the file is invalid or could not be opened.
     */
    private static function openFile(string $filename, ?string $anonymizerId)
    {
        self::ensureFile($filename, $anonymizerId);

        $handle = \fopen($filename, 'r');

        if (false === $handle) {
            throw self::createError(
                $anonymizerId,
                "Anonymizer '%s' could not open file: %s",
                "Could not open file: %s",
                $filename,
            );
        }

        return $handle;
    }

    /**
     * Create the exception to throw, using the anonymizer specific message
     * when an anonymizer identifier is known, the generic one otherwise.
     */
    private static function createError(
        ?string $anonymizerId,
        string $anonymizerFormat,
        string $genericFormat,
        string $subject,
    ): ConfigurationException {
        if ($anonymizerId) {
            return new ConfigurationException(\sprintf($anonymizerFormat, $anonymizerId, $subject));
        }

        return new ConfigurationException(\sprintf($genericFormat, $subject));
    }
}

0 commit comments

Comments
 (0)