Skip to content

Commit

Permalink
Switch exports to Zstd, from bzip2
Browse files Browse the repository at this point in the history
The main benefit of this format for our users is that it decompresses
much faster than bzip2, even at high compression levels.

At level 19 it also compresses better than bzip2 for our files.
Hopefully the compression time is still acceptable; if not, we can lower
the level so as not to overwork the server, at the price of slightly
bigger files.

On my i7-8700K, unarchiving sentences.tar.bz2 takes 15.5s, compared to
994ms for sentences.csv.zst compressed at level 19.  The file is 183 MiB
with zstd, compared to 197 MiB with bzip2.  We could go down to 167 MiB
with level 22 (which decompresses in 941ms), but compression time starts
to get much higher, so I am not sure this is worth it.

The only downside I see to this change is that users' automation will
have to be updated, so we should perhaps announce it before deploying it.
  • Loading branch information
linkmauve committed Dec 9, 2024
1 parent 3206771 commit e41635e
Show file tree
Hide file tree
Showing 5 changed files with 41 additions and 41 deletions.
12 changes: 6 additions & 6 deletions ansible/roles/setup_database/tasks/import_csv.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,10 @@
url: "{{ download_url }}{{ item }}"
dest: "/tmp/{{ item }}"
with_items:
- sentences.tar.bz2
- links.tar.bz2
- sentences.csv.zst
- links.csv.zst
- tag_metadata.csv
- tags_detailed.tar.bz2
- tags_detailed.csv.zst
when: import_csv == 'download'

- name: Unpacking csv's
Expand All @@ -34,9 +34,9 @@
owner: mysql
group: mysql
with_items:
- sentences.tar.bz2
- links.tar.bz2
- tags_detailed.tar.bz2
- sentences.csv.zst
- links.csv.zst
- tags_detailed.csv.zst
when: import_csv == 'download'

- name: Import sentences in the tatoeba database
Expand Down
36 changes: 18 additions & 18 deletions docs/cron/export.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,26 +19,26 @@ mv /var/tmp/*csv "$DL_DIR"

echo "Starting tarring at $(date -Iseconds)"
cd "$DL_DIR"
tar -cjf sentences_base.tar.bz2 sentences_base.csv
tar -cjf sentences_detailed.tar.bz2 sentences_detailed.csv
tar -cjf links.tar.bz2 links.csv
tar -cjf sentences.tar.bz2 sentences.csv
tar -cjf contributions.tar.bz2 contributions.csv
# Compress each export to <name>.csv.zst next to its source CSV.
# -f is required: unlike `tar -cjf`, zstd refuses to overwrite an existing
# output file, so without it a previous run's archive would be left in place.
zstd -19 -f sentences_base.csv
zstd -19 -f sentences_detailed.csv
zstd -19 -f links.csv
zstd -19 -f sentences.csv
zstd -19 -f contributions.csv
rm contributions.csv
tar -cjf comments.tar.bz2 sentence_comments.csv
# -f: overwrite an existing sentence_comments.csv.zst instead of refusing.
zstd -19 -f sentence_comments.csv
rm sentence_comments.csv
tar -cjf wall.tar.bz2 wall_posts.csv
# -f: overwrite an existing wall_posts.csv.zst instead of refusing.
zstd -19 -f wall_posts.csv
rm wall_posts.csv
tar -cjf tags.tar.bz2 tags.csv
tar -cjf user_lists.tar.bz2 user_lists.csv
tar -cjf sentences_in_lists.tar.bz2 sentences_in_lists.csv
tar -cjf jpn_indices.tar.bz2 jpn_indices.csv
tar -cjf sentences_with_audio.tar.bz2 sentences_with_audio.csv
tar -cjf user_languages.tar.bz2 user_languages.csv
tar -cjf tags_detailed.tar.bz2 tags_detailed.csv
tar -cjf sentences_CC0.tar.bz2 sentences_CC0.csv
tar -cjf transcriptions.tar.bz2 transcriptions.csv
tar -cjf sentences_base.tar.bz2 sentences_base.csv
# Compress the remaining exports. -f makes zstd overwrite an existing
# <name>.csv.zst (it refuses by default, whereas `tar -cjf` overwrote).
zstd -19 -f tags.csv
zstd -19 -f user_lists.csv
zstd -19 -f sentences_in_lists.csv
zstd -19 -f jpn_indices.csv
zstd -19 -f sentences_with_audio.csv
zstd -19 -f user_languages.csv
zstd -19 -f tags_detailed.csv
zstd -19 -f sentences_CC0.csv
zstd -19 -f transcriptions.csv
# sentences_base.csv is compressed a second time here; its .zst already
# exists at this point, so -f is what keeps this repeat from failing.
zstd -19 -f sentences_base.csv

echo "Starting language splitting for sentences at $(date -Iseconds)"
# Create per-language files for the different sentences files
Expand Down Expand Up @@ -173,7 +173,7 @@ mysql --skip-column-names --batch tatoeba -e \
}'

echo "Starting cleanup at $(date -Iseconds)"
find $TEMP_DIR -path '*tsv' -exec bzip2 -qf '{}' +
# Compress every per-language .tsv in place. bzip2 deleted its input after
# compressing; zstd keeps it unless told otherwise, so --rm is needed to
# avoid moving the uncompressed .tsv files along with the .tsv.zst ones.
find $TEMP_DIR -path '*tsv' -exec zstd -19 -qf --rm '{}' +
rm -rf $DL_DIR/per_language
rm transcriptions.csv
mv -f $TEMP_DIR $DL_DIR
Expand Down
20 changes: 10 additions & 10 deletions src/Template/Pages/downloads.ctp
Original file line number Diff line number Diff line change
Expand Up @@ -273,7 +273,7 @@ $transcriptionsOptions = $this->Downloads->createOptions('transcriptions');
<dl>
<dt><?= $filename ?></dt>
<dd>
<a href="<?= $download_url ?>sentences_base.tar.bz2">sentences_base.tar.bz2</a>
<a href="<?= $download_url ?>sentences_base.csv.zst">sentences_base.csv.zst</a>
</dd>
<dt><?= $description ?></dt>
<dd>
Expand Down Expand Up @@ -323,7 +323,7 @@ $transcriptionsOptions = $this->Downloads->createOptions('transcriptions');
<dl>
<dt><?= $filename ?></dt>
<dd>
<a href="<?= $download_url ?>links.tar.bz2">links.tar.bz2</a>
<a href="<?= $download_url ?>links.csv.zst">links.csv.zst</a>
</dd>
<dt><?= $description ?></dt>
<dd>
Expand All @@ -350,7 +350,7 @@ $transcriptionsOptions = $this->Downloads->createOptions('transcriptions');
<dl>
<dt><?= $filename ?></dt>
<dd>
<a href="<?= $download_url ?>tags.tar.bz2">tags.tar.bz2</a>
<a href="<?= $download_url ?>tags.csv.zst">tags.csv.zst</a>
</dd>
<dt><?= $description ?></dt>
<dd>
Expand All @@ -376,7 +376,7 @@ $transcriptionsOptions = $this->Downloads->createOptions('transcriptions');
<dl>
<dt><?= $filename ?></dt>
<dd>
<a href="<?= $download_url ?>user_lists.tar.bz2">user_lists.tar.bz2</a>
<a href="<?= $download_url ?>user_lists.csv.zst">user_lists.csv.zst</a>
</dd>
<dt><?= $description ?></dt>
<dd>
Expand All @@ -400,8 +400,8 @@ $transcriptionsOptions = $this->Downloads->createOptions('transcriptions');
<dl>
<dt><?= $filename ?></dt>
<dd>
<a href="<?= $download_url ?>sentences_in_lists.tar.bz2">
sentences_in_lists.tar.bz2
<a href="<?= $download_url ?>sentences_in_lists.csv.zst">
sentences_in_lists.csv.zst
</a>
</dd>
<dt><?= $description ?></dt>
Expand Down Expand Up @@ -429,7 +429,7 @@ $transcriptionsOptions = $this->Downloads->createOptions('transcriptions');
<dl>
<dt><?= $filename ?></dt>
<dd>
<a href="<?= $download_url ?>jpn_indices.tar.bz2">jpn_indices.tar.bz2</a>
<a href="<?= $download_url ?>jpn_indices.csv.zst">jpn_indices.csv.zst</a>
</dd>
<dt><?= $description ?></dt>
<dd>
Expand Down Expand Up @@ -460,8 +460,8 @@ $transcriptionsOptions = $this->Downloads->createOptions('transcriptions');
<dl>
<dt><?= $filename ?></dt>
<dd>
<a href="<?= $download_url ?>sentences_with_audio.tar.bz2">
sentences_with_audio.tar.bz2
<a href="<?= $download_url ?>sentences_with_audio.csv.zst">
sentences_with_audio.csv.zst
</a>
</dd>
<dt><?= $description ?></dt>
Expand Down Expand Up @@ -501,7 +501,7 @@ $transcriptionsOptions = $this->Downloads->createOptions('transcriptions');
<dl>
<dt><?= $filename ?></dt>
<dd>
<a href="<?= $download_url ?>user_languages.tar.bz2">user_languages.tar.bz2</a>
<a href="<?= $download_url ?>user_languages.csv.zst">user_languages.csv.zst</a>
</dd>
<dt><?= $description ?></dt>
<dd>
Expand Down
4 changes: 2 additions & 2 deletions src/View/Helper/DownloadsHelper.php
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ private function availableFiles($basename) {
);

$dir = new Folder($perLanguageDir);
$paths = $dir->findRecursive(".*$basename\.tsv\.bz2$");
$paths = $dir->findRecursive(".*$basename\.tsv\.zst$");
$map = [];
foreach ($paths as $path) {
$path = substr($path, strlen($perLanguageDir) + 1);
Expand All @@ -74,7 +74,7 @@ private function availableFiles($basename) {
public function createOptions($basename) {
$urlForAll = Folder::addPathElement(
Configure::read('Downloads.url'),
"$basename.tar.bz2"
"$basename.csv.zst"
);
$options[0] = [
'language' => __('All languages'),
Expand Down
10 changes: 5 additions & 5 deletions tests/TestCase/View/Helper/DownloadsHelperTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ class DownloadsHelperTest extends TestCase {

private static function createTempTree() {
$languages = ['eng', 'fra', 'jpn', 'unknown'];
$files = ['sentences.tsv.bz2', 'sentences_detailed.tsv.bz2', 'sentences_CC0.tsv.bz2'];
$files = ['sentences.tsv.zst', 'sentences_detailed.tsv.zst', 'sentences_CC0.tsv.zst'];
foreach ($languages as $lang) {
$path = Folder::addPathElement(TMP, ['exports', 'per_language', $lang]);
$subdir = new Folder($path, true);
Expand Down Expand Up @@ -54,7 +54,7 @@ public function testCreateOptions_InvalidBasename() {

$this->assertEquals(1, count($options));
$this->assertEquals(
Folder::addPathElement(Configure::read('Downloads.url'), "foobar.tar.bz2"),
Folder::addPathElement(Configure::read('Downloads.url'), "foobar.csv.zst"),
$options[0]['url']
);
}
Expand All @@ -77,14 +77,14 @@ public function testCreateOptions_ValidBasename($basename) {
$this->assertEquals(
Folder::addPathElement(
Configure::read('Downloads.url'),
"$basename.tar.bz2"
"$basename.csv.zst"
),
$options[0]['url']
);
$this->assertEquals(
Folder::addPathElement(
Configure::read('Downloads.url'),
['per_language', 'eng', "eng_$basename.tsv.bz2"]
['per_language', 'eng', "eng_$basename.tsv.zst"]
),
$options[1]['url']
);
Expand All @@ -103,7 +103,7 @@ public function testCreateOptions_NoPerLanguageFilesAvailable($basename) {
$this->assertEquals(
Folder::addPathElement(
Configure::read('Downloads.url'),
"$basename.tar.bz2"
"$basename.csv.zst"
),
$options[0]['url']
);
Expand Down

0 comments on commit e41635e

Please sign in to comment.