Skip to content

Commit e18362d

Browse files
committed
feature: file_column anonymizer, inject multi-column samples in database from a csv file
1 parent aa59caf commit e18362d

File tree

6 files changed

+190
-5
lines changed

6 files changed

+190
-5
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
## Next
44

5+
* [feature] 🌟 File multi-column anonymizer, inject sample rows in database from a CSV file.
56
* [feature] 🌟 File enum anonymizer, inject samples in database from a plain text or CSV file.
67
* [feature] 🌟 String pattern anonymizer, build complex strings by fetching values from other anonymizers.
78

src/Anonymization/Anonymizer/AbstractMultipleColumnAnonymizer.php

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -58,13 +58,13 @@ protected function validateOptions(): void
5858

5959
// We only validate column options here.
6060
// Other ones will be validated by each implementation.
61-
$options = \array_filter(
61+
$columnOptions = \array_filter(
6262
$this->options->all(),
6363
fn ($key) => \in_array($key, $this->getColumnNames()),
6464
ARRAY_FILTER_USE_KEY
6565
);
6666

67-
if (\count(\array_unique($options)) < \count($options)) {
67+
if (\count(\array_unique($columnOptions)) < \count($columnOptions)) {
6868
throw new \InvalidArgumentException("The same column has been mapped twice.");
6969
}
7070
}

src/Anonymization/Anonymizer/AnonymizerRegistry.php

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ class AnonymizerRegistry
1818
Core\DateAnonymizer::class,
1919
Core\EmailAnonymizer::class,
2020
Core\FileEnumAnonymizer::class,
21+
Core\FileMultipleColumnAnonymizer::class,
2122
Core\FirstNameAnonymizer::class,
2223
Core\FloatAnonymizer::class,
2324
Core\IbanBicAnonymizer::class,
Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace MakinaCorpus\DbToolsBundle\Anonymization\Anonymizer\Core;
6+
7+
use MakinaCorpus\DbToolsBundle\Anonymization\Anonymizer\AbstractMultipleColumnAnonymizer;
8+
use MakinaCorpus\DbToolsBundle\Attribute\AsAnonymizer;
9+
use MakinaCorpus\DbToolsBundle\Error\ConfigurationException;
10+
use MakinaCorpus\DbToolsBundle\Helper\FileReader;
11+
12+
#[AsAnonymizer(
13+
name: 'file_column',
14+
pack: 'core',
15+
description: <<<TXT
16+
Anonymize multiple text values using a random row from the given file.
17+
Options are:
18+
- 'columns': column names that matches file columns. If you need to
19+
skip one of the file columns, simply set null instead of a name.
20+
Please remember that other option names defined here cannot be
21+
column names.
22+
- 'source': filename to load, filename must be absolute, or relative
23+
to the configuration file directory.
24+
- 'file_csv_enclosure': if file is a CSV, use this as the enclosure
25+
character (default is '"').
26+
- 'file_csv_escape': if file is a CSV, use this as the escape
27+
character (default is '\\').
28+
- 'file_csv_separator': if file is a CSV, use this as the separator
29+
character (default is ',').
30+
- 'file_skip_header': when reading any file, set this to true to skip
31+
the first line (default is false).
32+
TXT
33+
)]
34+
class FileMultipleColumnAnonymizer extends AbstractMultipleColumnAnonymizer
{
    /**
     * Option names consumed by this anonymizer itself; they cannot be used as
     * column names in the 'columns' option.
     */
    private const RESERVED_OPTION_NAMES = [
        'source',
        'columns',
        'file_csv_enclosure',
        'file_csv_escape',
        'file_csv_separator',
        'file_skip_header',
    ];

    /**
     * Validates the 'columns' and 'source' options, and runs the parent
     * validation.
     *
     * @throws ConfigurationException
     *   When 'columns' is not an array, contains a value that is neither a
     *   string nor null, or contains a reserved option name.
     */
    #[\Override]
    protected function validateOptions(): void
    {
        // 'columns' is validated first: the parent validation calls
        // getColumnNames(), which iterates over 'columns' and would break on
        // a non-array value before a meaningful error could be raised.
        $columns = $this->options->get('columns', null, true);
        if (!\is_array($columns)) {
            throw new ConfigurationException("'columns' must be an array of string or null values.");
        }

        foreach ($columns as $index => $column) {
            // Null means "skip this file column".
            if (null === $column) {
                continue;
            }
            // Type check comes before the reserved name lookup so that a non
            // string value is always reported as an invalid type.
            if (!\is_string($column)) {
                throw new ConfigurationException(\sprintf("'columns' must be an array of string or null values (invalid type for column #%s).", $index));
            }
            if (\in_array($column, self::RESERVED_OPTION_NAMES, true)) {
                throw new ConfigurationException(\sprintf("'columns' values cannot be one of ('%s') for column #%s.", \implode("', '", self::RESERVED_OPTION_NAMES), $index));
            }
        }

        parent::validateOptions();

        FileReader::ensureFile($this->options->getString('source', null, true));
    }

    /**
     * Builds the column name list from the 'columns' option.
     *
     * Each null entry is replaced with a generated '_ignoredN' placeholder
     * name, N being a counter starting at 0, so that the returned list keeps
     * one name per entry of 'columns'.
     *
     * @return string[]
     */
    #[\Override]
    protected function getColumnNames(): array
    {
        $ret = [];

        $ignored = 0;
        foreach ($this->options->get('columns', null, true) as $name) {
            if (null === $name) {
                // It's easier to proceed this way than to strip down each
                // sample rows from the ignored columns in getSamples().
                // Even though, it would be cleaner, let's keep everything
                // simple for now.
                $ret[] = '_ignored' . ($ignored++);
            } else {
                $ret[] = $name;
            }
        }

        return $ret;
    }

    /**
     * Loads all samples from the file given by the 'source' option.
     *
     * The whole options bag is passed to the reader; presumably this is where
     * the 'file_csv_*' and 'file_skip_header' options are consumed (to be
     * confirmed against FileReader::readColumnFile()).
     *
     * @return array
     *   Everything FileReader::readColumnFile() yields, collected in an array.
     */
    #[\Override]
    protected function getSamples(): array
    {
        return \iterator_to_array(
            FileReader::readColumnFile(
                $this->options->getString('source', null, true),
                $this->options,
            ),
        );
    }
}

tests/Functional/Anonymizer/Core/FileEnumAnonymizerTest.php

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
namespace MakinaCorpus\DbToolsBundle\Tests\Functional\Anonymizer\Core;
66

77
use MakinaCorpus\DbToolsBundle\Anonymization\Anonymizer\Options;
8-
use MakinaCorpus\DbToolsBundle\Anonymization\Config\AnonymizerConfig;
98
use MakinaCorpus\DbToolsBundle\Test\FunctionalTestCase;
109

1110
class FileEnumAnonymizerTest extends FunctionalTestCase
@@ -44,12 +43,12 @@ public function testAnonymize(): void
4443
// File contents.
4544
$sample = ['foo', 'a', '1'];
4645

47-
$anonymizator = $this->createAnonymizatorWithConfig(new AnonymizerConfig(
46+
$anonymizator = $this->createAnonymizatorArbitrary(
4847
'table_test',
4948
'data',
5049
'file_enum',
5150
new Options(['source' => \dirname(__DIR__, 3) . '/Resources/Anonymization/Pack/resources/enum-file.txt'])
52-
));
51+
);
5352

5453
self::assertSame(
5554
'test1',
Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace MakinaCorpus\DbToolsBundle\Tests\Functional\Anonymizer\Core;
6+
7+
use MakinaCorpus\DbToolsBundle\Anonymization\Anonymizer\Options;
8+
use MakinaCorpus\DbToolsBundle\Test\FunctionalTestCase;
9+
10+
class FileMultipleColumnAnonymizerTest extends FunctionalTestCase
{
    /**
     * Creates 'table_test' with three fully populated rows and a fourth one
     * which only has an identifier.
     *
     * @before
     */
    protected function createTestData(): void
    {
        $this->createOrReplaceTable(
            'table_test',
            [
                'id' => 'integer',
                'column1' => 'string',
                'column2' => 'string',
            ],
            [
                [
                    'id' => 1,
                    'column1' => 'test1',
                    'column2' => 'test1',
                ],
                [
                    'id' => 2,
                    'column1' => 'test2',
                    'column2' => 'test2',
                ],
                [
                    'id' => 3,
                    'column1' => 'test3',
                    'column2' => 'test3',
                ],
                [
                    'id' => 4,
                ],
            ],
        );
    }

    /**
     * Anonymizes 'column1' and 'column2' using the first and third columns of
     * the CSV file, the second file column being skipped using null.
     */
    public function testAnonymize(): void
    {
        $anonymizator = $this->createAnonymizatorArbitrary(
            'table_test',
            'data',
            'file_column',
            new Options([
                'source' => \dirname(__DIR__, 3) . '/Resources/Anonymization/Pack/resources/enum-file.csv',
                'columns' => ['pif', null, 'pouf'],
                'pif' => 'column1',
                'pouf' => 'column2',
            ])
        );

        // Values from CSV: 'pif' samples end up in 'column1' and 'pouf'
        // samples in 'column2'.
        $samplePif = ['foo', 'a', '1', 'cat'];
        $samplePouf = ['baz', 'c', '3', 'girafe'];

        self::assertSame(
            "test1",
            $this->getDatabaseSession()->executeQuery('select column1 from table_test where id = 1')->fetchOne(),
        );

        $anonymizator->anonymize();

        $datas = $this->getDatabaseSession()->executeQuery('select * from table_test order by id asc')->fetchAllAssociative();

        // Rows are ordered by id, hence row at index N initially held the
        // N-th value of this list in both columns.
        foreach (['test1', 'test2', 'test3'] as $index => $originalValue) {
            self::assertNotNull($datas[$index]);
            self::assertNotSame($originalValue, $datas[$index]['column1']);
            self::assertNotSame($originalValue, $datas[$index]['column2']);
            self::assertContains($datas[$index]['column1'], $samplePif);
            self::assertContains($datas[$index]['column2'], $samplePouf);
        }

        // Row #4 was created without any column value; what it holds once
        // anonymized is not asserted here.

        self::assertCount(4, \array_unique(\array_map(fn ($value) => \serialize($value), $datas)), 'All generated values are different.');
    }
}

0 commit comments

Comments
 (0)