Skip to content

Commit 46187c1

Browse files
committed
feature: file_column anonymizer that inject sample rows in database from a csv file
1 parent c129af9 commit 46187c1

File tree

7 files changed

+334
-2
lines changed

7 files changed

+334
-2
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
## Next
44

5+
* [feature] 🌟 File multi-column anonymizer, inject sample rows in database from a CSV file.
56
* [feature] 🌟 File enum anonymizer, inject samples in database from a plain text or CSV file.
67
* [feature] 🌟 String pattern anonymizer, build complex strings by fetching values from other anonymizers.
78

docs/content/anonymization/core-anonymizers.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ This page list all *Anonymizers* provided by *DbToolsBundle*.
1515
<!--@include: ./core-anonymizers/string.md-->
1616
<!--@include: ./core-anonymizers/pattern.md-->
1717
<!--@include: ./core-anonymizers/file-enum.md-->
18+
<!--@include: ./core-anonymizers/file-column.md-->
1819
<!--@include: ./core-anonymizers/lastname.md-->
1920
<!--@include: ./core-anonymizers/firstname.md-->
2021
<!--@include: ./core-anonymizers/lorem-ipsum.md-->
Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
## File multiple column
2+
3+
This Anonymizer will anonymize multiple columns at once using value rows from an
4+
input file. As of now, only CSV files are supported.
5+
6+
This anonymizer behaves like any other multiple column anonymizer and allows you
7+
to arbitrarily map any sample column into any database table column using the
8+
anonymizer options.
9+
10+
Given the following file:
11+
12+
```txt
13+
Number,Foo,Animal
14+
1,foo,cat
15+
2,bar,dog
16+
3,baz,girafe
17+
```
18+
19+
Then:
20+
21+
@@@ standalone docker
22+
23+
```yaml [YAML]
24+
# db_tools.config.yaml
25+
anonymization:
26+
default:
27+
customer:
28+
my_data:
29+
anonymizer: file_column
30+
options:
31+
source: ./resources/my_data.csv
32+
# Define your CSV file column names.
33+
columns: [number, foo, animal]
34+
# Other allowed options.
35+
file_skip_header: true
36+
# Now your columns, keys are CSV column names
37+
# you set above, values are your database column
38+
# names.
39+
number: my_integer_column
40+
foo: my_foo_column
41+
animal: my_animal_column
42+
#...
43+
```
44+
45+
@@@
46+
@@@ symfony
47+
48+
::: code-group
49+
```php [Attribute]
50+
namespace App\Entity;
51+
52+
use Doctrine\ORM\Mapping as ORM;
53+
use MakinaCorpus\DbToolsBundle\Attribute\Anonymize;
54+
55+
#[ORM\Entity()]
56+
#[ORM\Table(name: 'customer')]
57+
#[Anonymize(type: 'file_column', options: [ // [!code ++]
58+
'source' => './resources/my_data.csv', // [!code ++]
59+
// Define your CSV file column names. // [!code ++]
60+
'columns' => ['number', 'foo', 'animal'], // [!code ++]
61+
// Other allowed options. // [!code ++]
62+
'file_skip_header' => true, // [!code ++]
63+
// Now your columns, keys are CSV column names // [!code ++]
64+
// you set above, values are your database column // [!code ++]
65+
// names. // [!code ++]
66+
'number' => 'my_integer_column', // [!code ++]
67+
'foo' => 'my_foo_column', // [!code ++]
68+
'animal' => 'my_animal_column', // [!code ++]
69+
])] // [!code ++]
70+
class Customer
71+
{
72+
// ...
73+
74+
#[ORM\Column(length: 255)]
75+
private ?string $myNumber = null;
76+
77+
#[ORM\Column(length: 255)]
78+
private ?string $myFoo = null;
79+
80+
#[ORM\Column(length: 255)]
81+
private ?string $myAnimal = null;
82+
83+
// ...
84+
}
85+
```
86+
87+
```yaml [YAML]
88+
# config/anonymization.yaml
89+
customer:
90+
my_data:
91+
anonymizer: file_column
92+
options:
93+
source: ./resources/my_data.csv
94+
# Define your CSV file column names.
95+
columns: [number, foo, animal]
96+
# Other allowed options.
97+
file_skip_header: true
98+
# Now your columns, keys are CSV column names
99+
# you set above, values are your database column
100+
# names.
101+
number: my_integer_column
102+
foo: my_foo_column
103+
animal: my_animal_column
104+
#...
105+
```
106+
:::
107+
108+
:::warning
109+
This anonymizer works at the *table level* which means that the PHP attribute
110+
cannot target object properties: you must specify table column names and not
111+
PHP class property names.
112+
:::
113+
114+
@@@
115+
116+
When parsing a file, you can set the following options as well:
117+
- `file_csv_enclosure`: if file is a CSV, use this as the enclosure character (default is `'"'`).
118+
- `file_csv_escape`: if file is a CSV, use this as the escape character (default is `'\\'`).
119+
- `file_csv_separator`: if file is a CSV, use this as the separator character (default is `','`).
120+
- `file_skip_header`: when reading any file, set this to true to skip the first line (default is `false`).
121+
122+
:::tip
123+
The filename can be absolute, or relative. For relative file resolution
124+
please see [*File name resolution*](#file-name-resolution)
125+
:::

src/Anonymization/Anonymizer/AbstractMultipleColumnAnonymizer.php

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -58,13 +58,13 @@ protected function validateOptions(): void
5858

5959
// We only validate column options here.
6060
// Other ones will be validated by each implementation.
61-
$options = \array_filter(
61+
$columnOptions = \array_filter(
6262
$this->options->all(),
6363
fn ($key) => \in_array($key, $this->getColumnNames()),
6464
ARRAY_FILTER_USE_KEY
6565
);
6666

67-
if (\count(\array_unique($options)) < \count($options)) {
67+
if (\count(\array_unique($columnOptions)) < \count($columnOptions)) {
6868
throw new \InvalidArgumentException("The same column has been mapped twice.");
6969
}
7070
}

src/Anonymization/Anonymizer/AnonymizerRegistry.php

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ class AnonymizerRegistry
1818
Core\DateAnonymizer::class,
1919
Core\EmailAnonymizer::class,
2020
Core\FileEnumAnonymizer::class,
21+
Core\FileMultipleColumnAnonymizer::class,
2122
Core\FirstNameAnonymizer::class,
2223
Core\FloatAnonymizer::class,
2324
Core\IbanBicAnonymizer::class,
Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace MakinaCorpus\DbToolsBundle\Anonymization\Anonymizer\Core;
6+
7+
use MakinaCorpus\DbToolsBundle\Anonymization\Anonymizer\AbstractMultipleColumnAnonymizer;
8+
use MakinaCorpus\DbToolsBundle\Attribute\AsAnonymizer;
9+
use MakinaCorpus\DbToolsBundle\Error\ConfigurationException;
10+
use MakinaCorpus\DbToolsBundle\Helper\FileReader;
11+
12+
#[AsAnonymizer(
    name: 'file_column',
    pack: 'core',
    description: <<<TXT
    Anonymize multiple text values using a random row from the given file.
    Options are:
        - 'columns': column names that matches file columns. If you need to
          skip one of the file columns, simply set null instead of a name.
          Please remember that other option names defined here cannot be
          column names.
        - 'source': filename to load, filename must be absolute, or relative
          to the configuration file directory.
        - 'file_csv_enclosure': if file is a CSV, use this as the enclosure
          character (default is '"').
        - 'file_csv_escape': if file is a CSV, use this as the escape
          character (default is '\\').
        - 'file_csv_separator': if file is a CSV, use this as the separator
          character (default is ',').
        - 'file_skip_header': when reading any file, set this to true to skip
          the first line (default is false).
    TXT
)]
class FileMultipleColumnAnonymizer extends AbstractMultipleColumnAnonymizer
{
    /**
     * Option names consumed by this anonymizer itself.
     *
     * Sample column names share the same option namespace (each sample column
     * name is an option key mapping to a database column), hence they cannot
     * be named like one of these.
     *
     * NOTE(review): getFilename() also reads the 'base_path' option, which is
     * not part of this list; confirm whether it should be reserved as well.
     */
    private const RESERVED_OPTION_NAMES = [
        'source',
        'columns',
        'file_csv_enclosure',
        'file_csv_escape',
        'file_csv_separator',
        'file_skip_header',
    ];

    /**
     * Resolved source filename, computed once by getFilename().
     */
    private ?string $filename = null;

    /**
     * Get the source filename from the 'source' option.
     *
     * When a 'base_path' option is set, the filename is passed through
     * FileReader::ensurePathAbsolute() using it. The result is then handed to
     * FileReader::ensureFile() (presumably failing when the file cannot be
     * used, to be confirmed against FileReader) and cached for later calls.
     */
    protected function getFilename(): string
    {
        // Explicit null comparison: relying on truthiness would recompute the
        // value on each call for a falsy filename such as "0".
        if (null !== $this->filename) {
            return $this->filename;
        }

        $filename = $this->options->getString('source', null, true);

        if ($basePath = $this->options->getString('base_path')) {
            $filename = FileReader::ensurePathAbsolute($filename, $basePath);
        }

        FileReader::ensureFile($filename);

        return $this->filename = $filename;
    }

    /**
     * Validate the 'columns' option, the column mapping, then the source file.
     *
     * @throws ConfigurationException
     *   When 'columns' is not an array of string or null values, or when one
     *   of its values is a reserved option name.
     */
    #[\Override]
    protected function validateOptions(): void
    {
        // 'columns' is validated first: the parent implementation calls
        // getColumnNames(), which iterates over this option and hence needs
        // it to be an array.
        $columns = $this->options->get('columns', null, true);
        if (!\is_array($columns)) {
            throw new ConfigurationException("'columns' must be an array of string or null values.");
        }

        foreach ($columns as $index => $column) {
            // Null means "ignore this file column".
            if (null === $column) {
                continue;
            }
            // Type is checked before the reserved name lookup so that a non
            // string value is always reported as a type error.
            if (!\is_string($column)) {
                throw new ConfigurationException(\sprintf("'columns' must be an array of string or null values (invalid type for column #%d).", $index));
            }
            if (\in_array($column, self::RESERVED_OPTION_NAMES, true)) {
                throw new ConfigurationException(\sprintf("'columns' values cannot be one of ('%s') for column #%d.", \implode("', '", self::RESERVED_OPTION_NAMES), $index));
            }
        }

        parent::validateOptions();

        // Resolves the source file and runs the FileReader checks on it.
        $this->getFilename();
    }

    /**
     * Get sample column names, in file column order.
     *
     * Null entries of the 'columns' option are replaced with generated
     * '_ignored0', '_ignored1', ... names so that the returned list keeps one
     * entry per file column.
     *
     * @return string[]
     */
    #[\Override]
    protected function getColumnNames(): array
    {
        $ret = [];

        $ignored = 0;
        foreach ($this->options->get('columns', null, true) as $name) {
            if (null === $name) {
                // It's easier to proceed this way than to strip down each
                // sample rows from the ignored columns in getSamples().
                // Even though, it would be cleaner, let's keep everything
                // simple for now.
                $ret[] = '_ignored' . ($ignored++);
            } else {
                $ret[] = $name;
            }
        }

        return $ret;
    }

    /**
     * Load all sample rows from the source file.
     *
     * The whole FileReader::readColumnFile() result is materialized as an
     * array; options are passed along to the reader.
     */
    #[\Override]
    protected function getSamples(): array
    {
        return \iterator_to_array(
            FileReader::readColumnFile(
                $this->getFilename(),
                $this->options,
            ),
        );
    }
}
Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace MakinaCorpus\DbToolsBundle\Tests\Functional\Anonymizer\Core;
6+
7+
use MakinaCorpus\DbToolsBundle\Anonymization\Anonymizer\Options;
8+
use MakinaCorpus\DbToolsBundle\Anonymization\Config\AnonymizerConfig;
9+
use MakinaCorpus\DbToolsBundle\Test\FunctionalTestCase;
10+
11+
class FileMultipleColumnAnonymizerTest extends FunctionalTestCase
{
    /**
     * Create the 'table_test' table: three fully populated rows and a fourth
     * one with only an identifier.
     *
     * @before
     */
    protected function createTestData(): void
    {
        $schema = [
            'id' => 'integer',
            'column1' => 'string',
            'column2' => 'string',
        ];

        $rows = [];
        foreach ([1, 2, 3] as $id) {
            $rows[] = [
                'id' => $id,
                'column1' => 'test' . $id,
                'column2' => 'test' . $id,
            ];
        }
        // Last row has no value for 'column1' and 'column2'.
        $rows[] = ['id' => 4];

        $this->createOrReplaceTable('table_test', $schema, $rows);
    }

    /**
     * First and third file columns are mapped to 'column1' and 'column2',
     * the second file column is ignored.
     */
    public function testAnonymize(): void
    {
        $options = new Options([
            'source' => \dirname(__DIR__, 3) . '/Resources/Anonymization/Pack/resources/enum-file.csv',
            'columns' => ['pif', null, 'pouf'],
            'pif' => 'column1',
            'pouf' => 'column2',
        ]);

        $anonymizator = $this->createAnonymizatorWithConfig(
            new AnonymizerConfig('table_test', 'data', 'file_column', $options)
        );

        // Values from CSV.
        $samplePif = ['foo', 'a', '1', 'cat'];
        $samplePouf = ['baz', 'c', '3', 'girafe'];

        // Sanity check: data is untouched before anonymization.
        $before = $this
            ->getDatabaseSession()
            ->executeQuery('select column1 from table_test where id = 1')
            ->fetchOne()
        ;
        self::assertSame("test1", $before);

        $anonymizator->anonymize();

        $datas = $this
            ->getDatabaseSession()
            ->executeQuery('select * from table_test order by id asc')
            ->fetchAllAssociative()
        ;

        // The three populated rows must have both columns replaced with
        // values from the expected sample sets.
        foreach ([0 => 'test1', 1 => 'test2', 2 => 'test3'] as $index => $original) {
            self::assertNotNull($datas[$index]);
            self::assertNotSame($original, $datas[$index]['column1']);
            self::assertNotSame($original, $datas[$index]['column2']);
            self::assertContains($datas[$index]['column1'], $samplePif);
            self::assertContains($datas[$index]['column2'], $samplePouf);
        }

        // Fourth row (no initial column values) is only covered by the
        // uniqueness check below.
        $serialized = \array_map(static fn ($row) => \serialize($row), $datas);

        self::assertCount(4, \array_unique($serialized), 'All generated values are different.');
    }
}

0 commit comments

Comments
 (0)