Skip to content

Commit e2d1ef7

Browse files
committed
feature: file_column anonymizer, inject multi-column samples in database from a csv file
1 parent fa87643 commit e2d1ef7

File tree

5 files changed

+189
-2
lines changed

5 files changed

+189
-2
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
## Next
44

5+
* [feature] 🌟 File multi-column anonymizer, inject sample rows in database from a CSV file.
56
* [feature] 🌟 File enum anonymizer, inject samples in database from a plain text or CSV file.
67
* [feature] 🌟 String pattern anonymizer, build complex strings by fetching values from other anonymizers.
78

src/Anonymization/Anonymizer/AbstractMultipleColumnAnonymizer.php

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -58,13 +58,13 @@ protected function validateOptions(): void
5858

5959
// We only validate column options here.
6060
// Other ones will be validated by each implementation.
61-
$options = \array_filter(
61+
$columnOptions = \array_filter(
6262
$this->options->all(),
6363
fn ($key) => \in_array($key, $this->getColumnNames()),
6464
ARRAY_FILTER_USE_KEY
6565
);
6666

67-
if (\count(\array_unique($options)) < \count($options)) {
67+
if (\count(\array_unique($columnOptions)) < \count($columnOptions)) {
6868
throw new \InvalidArgumentException("The same column has been mapped twice.");
6969
}
7070
}

src/Anonymization/Anonymizer/AnonymizerRegistry.php

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ class AnonymizerRegistry
1818
Core\DateAnonymizer::class,
1919
Core\EmailAnonymizer::class,
2020
Core\FileEnumAnonymizer::class,
21+
Core\FileMultipleColumnAnonymizer::class,
2122
Core\FirstNameAnonymizer::class,
2223
Core\FloatAnonymizer::class,
2324
Core\IbanBicAnonymizer::class,
Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace MakinaCorpus\DbToolsBundle\Anonymization\Anonymizer\Core;
6+
7+
use MakinaCorpus\DbToolsBundle\Anonymization\Anonymizer\AbstractMultipleColumnAnonymizer;
8+
use MakinaCorpus\DbToolsBundle\Attribute\AsAnonymizer;
9+
use MakinaCorpus\DbToolsBundle\Error\ConfigurationException;
10+
use MakinaCorpus\DbToolsBundle\Helper\FileReader;
11+
12+
#[AsAnonymizer(
    name: 'file_column',
    pack: 'core',
    description: <<<TXT
    Anonymize multiple text values using a random row from the given file.
    Options are:
    - 'columns': column names that matches file columns. If you need to
    skip one of the file columns, simply set null instead of a name.
    Please remember that other option names defined here cannot be
    column names.
    - 'source': filename to load, filename must be absolute, or relative
    to the configuration file directory.
    - 'file_csv_enclosure': if file is a CSV, use this as the enclosure
    character (default is '"').
    - 'file_csv_escape': if file is a CSV, use this as the escape
    character (default is '\\').
    - 'file_csv_separator': if file is a CSV, use this as the separator
    character (default is ',').
    - 'file_skip_header': when reading any file, set this to true to skip
    the first line (default is false).
    TXT
)]
class FileMultipleColumnAnonymizer extends AbstractMultipleColumnAnonymizer
{
    /**
     * Option names consumed by this anonymizer itself.
     *
     * Entries of the 'columns' option are themselves used as option keys
     * (they map a file column to a database column), hence they must not
     * collide with any of these names.
     */
    private const RESERVED_OPTION_NAMES = [
        'source',
        'columns',
        'file_csv_enclosure',
        'file_csv_escape',
        'file_csv_separator',
        'file_skip_header',
    ];

    /**
     * Validate the 'source' and 'columns' options on top of the parent checks.
     *
     * @throws ConfigurationException
     *   When 'columns' is not an array, when one of its entries is neither
     *   a string nor null, or when an entry is a reserved option name.
     */
    #[\Override]
    protected function validateOptions(): void
    {
        // The parent implementation relies on getColumnNames(), which itself
        // guards against a non-array 'columns' option.
        parent::validateOptions();

        // NOTE(review): the third argument is presumably a "required" flag,
        // confirm against the Options class.
        FileReader::ensureFile($this->options->getString('source', null, true));

        foreach ($this->getColumnsOption() as $index => $column) {
            // Check the type first: a non-string entry must be reported as a
            // type error, never as a reserved name.
            if (!\is_string($column) && null !== $column) {
                throw new ConfigurationException(\sprintf("'columns' must be an array of string or null values (invalid type for column #%d).", $index));
            }
            // Strict comparison, otherwise loosely-equal values could be
            // mistaken for a reserved name.
            if (\in_array($column, self::RESERVED_OPTION_NAMES, true)) {
                throw new ConfigurationException(\sprintf("'columns' values cannot be one of ('%s') for column #%d.", \implode("', '", self::RESERVED_OPTION_NAMES), $index));
            }
        }
    }

    /**
     * Build the column name list from the 'columns' option, in file order.
     *
     * Null entries (skipped file columns) are replaced with generated
     * '_ignoredN' placeholder names, N being a counter starting at 0.
     *
     * @throws ConfigurationException
     *   When the 'columns' option is not an array.
     */
    #[\Override]
    protected function getColumnNames(): array
    {
        $ret = [];

        $ignored = 0;
        foreach ($this->getColumnsOption() as $name) {
            if (null === $name) {
                // Giving skipped columns a placeholder name is easier than
                // stripping them from every sample row in getSamples(), even
                // though the latter would be cleaner.
                $ret[] = '_ignored' . ($ignored++);
            } else {
                $ret[] = $name;
            }
        }

        return $ret;
    }

    /**
     * Load all sample rows from the 'source' file.
     *
     * The anonymizer options are forwarded to the file reader as-is.
     */
    #[\Override]
    protected function getSamples(): array
    {
        return \iterator_to_array(
            FileReader::readColumnFile(
                $this->options->getString('source', null, true),
                $this->options,
            ),
        );
    }

    /**
     * Fetch the raw 'columns' option and ensure it is an array.
     *
     * @throws ConfigurationException
     *   When the option value is not an array.
     */
    private function getColumnsOption(): array
    {
        $columns = $this->options->get('columns', null, true);

        if (!\is_array($columns)) {
            throw new ConfigurationException("'columns' must be an array of string or null values.");
        }

        return $columns;
    }
}
Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace MakinaCorpus\DbToolsBundle\Tests\Functional\Anonymizer\Core;
6+
7+
use MakinaCorpus\DbToolsBundle\Anonymization\Anonymizer\Options;
8+
use MakinaCorpus\DbToolsBundle\Anonymization\Config\AnonymizerConfig;
9+
use MakinaCorpus\DbToolsBundle\Test\FunctionalTestCase;
10+
11+
class FileMultipleColumnAnonymizerTest extends FunctionalTestCase
{
    /**
     * Create 'table_test' with three fully populated rows (ids 1 to 3) and
     * a fourth row (id 4) that only carries its identifier.
     *
     * @before
     */
    protected function createTestData(): void
    {
        $rows = [];
        foreach ([1, 2, 3] as $id) {
            $rows[] = [
                'id' => $id,
                'column1' => 'test' . $id,
                'column2' => 'test' . $id,
            ];
        }
        // Last row has no value for column1 and column2.
        $rows[] = ['id' => 4];

        $this->createOrReplaceTable(
            'table_test',
            [
                'id' => 'integer',
                'column1' => 'string',
                'column2' => 'string',
            ],
            $rows,
        );
    }

    /**
     * Anonymize column1 and column2 using the first and third columns of
     * the CSV file, the second file column being skipped.
     */
    public function testAnonymize(): void
    {
        $options = new Options([
            'source' => \dirname(__DIR__, 3) . '/Resources/Anonymization/Pack/resources/enum-file.csv',
            'columns' => ['pif', null, 'pouf'],
            'pif' => 'column1',
            'pouf' => 'column2',
        ]);
        $config = new AnonymizerConfig('table_test', 'data', 'file_column', $options);
        $anonymizator = $this->createAnonymizatorWithConfig($config);

        // Values from CSV, for the 'pif' and 'pouf' file columns.
        $pifSamples = ['foo', 'a', '1', 'cat'];
        $poufSamples = ['baz', 'c', '3', 'girafe'];

        // Sanity check: data is untouched before anonymization.
        $valueBefore = $this
            ->getDatabaseSession()
            ->executeQuery('select column1 from table_test where id = 1')
            ->fetchOne()
        ;
        self::assertSame("test1", $valueBefore);

        $anonymizator->anonymize();

        $rows = $this
            ->getDatabaseSession()
            ->executeQuery('select * from table_test order by id asc')
            ->fetchAllAssociative()
        ;

        // Each populated row must have lost its original values and now
        // hold values picked from the CSV samples.
        foreach (['test1', 'test2', 'test3'] as $position => $originalValue) {
            self::assertNotNull($rows[$position]);
            self::assertNotSame($originalValue, $rows[$position]['column1']);
            self::assertNotSame($originalValue, $rows[$position]['column2']);
            self::assertContains($rows[$position]['column1'], $pifSamples);
            self::assertContains($rows[$position]['column2'], $poufSamples);
        }

        // Assertions on the fourth row are currently disabled:
        // self::assertNull($datas[3]['column1']);
        // self::assertNull($datas[3]['my_secondary_address']);

        $serializedRows = \array_map(static fn ($row) => \serialize($row), $rows);
        self::assertCount(4, \array_unique($serializedRows), 'All generated values are different.');
    }
}

0 commit comments

Comments
 (0)