Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Switch exports to Zstd, from bzip2 #3148

Open
wants to merge 1 commit into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions ansible/roles/setup_database/tasks/import_csv.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,10 @@
url: "{{ download_url }}{{ item }}"
dest: "/tmp/{{ item }}"
with_items:
- sentences.tar.bz2
- links.tar.bz2
- sentences.csv.zst
- links.csv.zst
- tag_metadata.csv
- tags_detailed.tar.bz2
- tags_detailed.csv.zst
when: import_csv == 'download'

- name: Unpacking csv's
Expand All @@ -34,9 +34,9 @@
owner: mysql
group: mysql
with_items:
- sentences.tar.bz2
- links.tar.bz2
- tags_detailed.tar.bz2
- sentences.csv.zst
- links.csv.zst
- tags_detailed.csv.zst
when: import_csv == 'download'

- name: Import sentences in the tatoeba database
Expand Down
56 changes: 37 additions & 19 deletions docs/cron/export.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,28 +17,46 @@ mv /var/tmp/*csv "$DL_DIR"
mysql -u "$DB_USER" -p"$DB_PASS" "$DB" < "$ROOT"/docs/database/scripts/wwwjdic.sql
mv /var/tmp/*csv "$DL_DIR"

echo "Starting tarring at $(date -Iseconds)"
compress_csv () {
    # Compress one CSV export into both download formats.
    #
    # $1: name of the CSV file (e.g. "links.csv"). Produces "<base>.tar.bz2"
    #     (legacy bzipped tar archive) and "<name>.zst" next to it. The CSV
    #     itself is kept; callers remove it when it is no longer needed.
    #
    # TODO: Remove the bzipped tar archive once most users have migrated to
    # the zstd file. During the transition period we should monitor which
    # files get downloaded to see how the migration is progressing.
    #
    # Producing both formats costs CPU time and disk space every week, and
    # bzip2 costs users CPU time when decompressing, hence the migration to
    # Zstd.
    #
    # NOTE(review): the archive name is derived from the CSV name, so
    # sentence_comments.csv and wall_posts.csv no longer yield the historical
    # comments.tar.bz2 / wall.tar.bz2 names - confirm this is intended.
    tar -cjf "${1%csv}tar.bz2" "$1"
    # -f: overwrite the .zst left by a previous run (or by an earlier call in
    #     the same run); without it zstd refuses to overwrite when it cannot
    #     prompt, e.g. under cron.
    # -q: no progress output, matching compress_tsv.
    zstd -19 -qf "$1"
}

compress_tsv () {
    # Compress each given TSV file into both "<file>.bz2" and "<file>.zst",
    # then remove the uncompressed file.
    #
    # $@: one or more TSV paths. Accepting several paths lets the function be
    #     driven by batch callers as well as one file at a time.
    #
    # TODO: Same as for the CSV exports, remove the bzip2 file once we see
    # fewer requests for it.
    local tsv
    for tsv in "$@"; do
        # -k keeps the input: by default bzip2 deletes it, which would leave
        # nothing for zstd to compress.
        bzip2 -qfk "$tsv"
        # --rm deletes the input after a successful compression, so the net
        # effect matches the old bzip2-only cleanup (no loose .tsv remains).
        zstd -19 -qf --rm "$tsv"
    done
}

echo "Starting compressing at $(date -Iseconds)"
cd "$DL_DIR"
tar -cjf sentences_base.tar.bz2 sentences_base.csv
tar -cjf sentences_detailed.tar.bz2 sentences_detailed.csv
tar -cjf links.tar.bz2 links.csv
tar -cjf sentences.tar.bz2 sentences.csv
tar -cjf contributions.tar.bz2 contributions.csv
compress_csv sentences_base.csv
compress_csv sentences_detailed.csv
compress_csv links.csv
compress_csv sentences.csv
compress_csv contributions.csv
rm contributions.csv
tar -cjf comments.tar.bz2 sentence_comments.csv
compress_csv sentence_comments.csv
rm sentence_comments.csv
tar -cjf wall.tar.bz2 wall_posts.csv
compress_csv wall_posts.csv
rm wall_posts.csv
tar -cjf tags.tar.bz2 tags.csv
tar -cjf user_lists.tar.bz2 user_lists.csv
tar -cjf sentences_in_lists.tar.bz2 sentences_in_lists.csv
tar -cjf jpn_indices.tar.bz2 jpn_indices.csv
tar -cjf sentences_with_audio.tar.bz2 sentences_with_audio.csv
tar -cjf user_languages.tar.bz2 user_languages.csv
tar -cjf tags_detailed.tar.bz2 tags_detailed.csv
tar -cjf sentences_CC0.tar.bz2 sentences_CC0.csv
tar -cjf transcriptions.tar.bz2 transcriptions.csv
tar -cjf sentences_base.tar.bz2 sentences_base.csv
compress_csv tags.csv
compress_csv user_lists.csv
compress_csv sentences_in_lists.csv
compress_csv jpn_indices.csv
compress_csv sentences_with_audio.csv
compress_csv user_languages.csv
compress_csv tags_detailed.csv
compress_csv sentences_CC0.csv
compress_csv transcriptions.csv
compress_csv sentences_base.csv

echo "Starting language splitting for sentences at $(date -Iseconds)"
# Create per-language files for the different sentences files
Expand Down Expand Up @@ -173,7 +191,7 @@ mysql --skip-column-names --batch tatoeba -e \
}'

echo "Starting cleanup at $(date -Iseconds)"
find $TEMP_DIR -path '*tsv' -exec bzip2 -qf '{}' +
# find's -exec runs external programs and cannot see shell functions, so feed
# the matching paths to compress_tsv from the current shell instead.
# (Paths are read line by line; export file names are not expected to contain
# newlines.)
find "$TEMP_DIR" -path '*tsv' | while IFS= read -r tsv_file; do
    compress_tsv "$tsv_file"
done
rm -rf $DL_DIR/per_language
rm transcriptions.csv
mv -f $TEMP_DIR $DL_DIR
Expand Down
20 changes: 10 additions & 10 deletions src/Template/Pages/downloads.ctp
Original file line number Diff line number Diff line change
Expand Up @@ -273,7 +273,7 @@ $transcriptionsOptions = $this->Downloads->createOptions('transcriptions');
<dl>
<dt><?= $filename ?></dt>
<dd>
<a href="<?= $download_url ?>sentences_base.tar.bz2">sentences_base.tar.bz2</a>
<a href="<?= $download_url ?>sentences_base.csv.zst">sentences_base.csv.zst</a>
</dd>
<dt><?= $description ?></dt>
<dd>
Expand Down Expand Up @@ -323,7 +323,7 @@ $transcriptionsOptions = $this->Downloads->createOptions('transcriptions');
<dl>
<dt><?= $filename ?></dt>
<dd>
<a href="<?= $download_url ?>links.tar.bz2">links.tar.bz2</a>
<a href="<?= $download_url ?>links.csv.zst">links.csv.zst</a>
</dd>
<dt><?= $description ?></dt>
<dd>
Expand All @@ -350,7 +350,7 @@ $transcriptionsOptions = $this->Downloads->createOptions('transcriptions');
<dl>
<dt><?= $filename ?></dt>
<dd>
<a href="<?= $download_url ?>tags.tar.bz2">tags.tar.bz2</a>
<a href="<?= $download_url ?>tags.csv.zst">tags.csv.zst</a>
</dd>
<dt><?= $description ?></dt>
<dd>
Expand All @@ -376,7 +376,7 @@ $transcriptionsOptions = $this->Downloads->createOptions('transcriptions');
<dl>
<dt><?= $filename ?></dt>
<dd>
<a href="<?= $download_url ?>user_lists.tar.bz2">user_lists.tar.bz2</a>
<a href="<?= $download_url ?>user_lists.csv.zst">user_lists.csv.zst</a>
</dd>
<dt><?= $description ?></dt>
<dd>
Expand All @@ -400,8 +400,8 @@ $transcriptionsOptions = $this->Downloads->createOptions('transcriptions');
<dl>
<dt><?= $filename ?></dt>
<dd>
<a href="<?= $download_url ?>sentences_in_lists.tar.bz2">
sentences_in_lists.tar.bz2
<a href="<?= $download_url ?>sentences_in_lists.csv.zst">
sentences_in_lists.csv.zst
</a>
</dd>
<dt><?= $description ?></dt>
Expand Down Expand Up @@ -429,7 +429,7 @@ $transcriptionsOptions = $this->Downloads->createOptions('transcriptions');
<dl>
<dt><?= $filename ?></dt>
<dd>
<a href="<?= $download_url ?>jpn_indices.tar.bz2">jpn_indices.tar.bz2</a>
<a href="<?= $download_url ?>jpn_indices.csv.zst">jpn_indices.csv.zst</a>
</dd>
<dt><?= $description ?></dt>
<dd>
Expand Down Expand Up @@ -460,8 +460,8 @@ $transcriptionsOptions = $this->Downloads->createOptions('transcriptions');
<dl>
<dt><?= $filename ?></dt>
<dd>
<a href="<?= $download_url ?>sentences_with_audio.tar.bz2">
sentences_with_audio.tar.bz2
<a href="<?= $download_url ?>sentences_with_audio.csv.zst">
sentences_with_audio.csv.zst
</a>
</dd>
<dt><?= $description ?></dt>
Expand Down Expand Up @@ -501,7 +501,7 @@ $transcriptionsOptions = $this->Downloads->createOptions('transcriptions');
<dl>
<dt><?= $filename ?></dt>
<dd>
<a href="<?= $download_url ?>user_languages.tar.bz2">user_languages.tar.bz2</a>
<a href="<?= $download_url ?>user_languages.csv.zst">user_languages.csv.zst</a>
</dd>
<dt><?= $description ?></dt>
<dd>
Expand Down
4 changes: 2 additions & 2 deletions src/View/Helper/DownloadsHelper.php
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ private function availableFiles($basename) {
);

$dir = new Folder($perLanguageDir);
$paths = $dir->findRecursive(".*$basename\.tsv\.bz2$");
$paths = $dir->findRecursive(".*$basename\.tsv\.zst$");
$map = [];
foreach ($paths as $path) {
$path = substr($path, strlen($perLanguageDir) + 1);
Expand All @@ -74,7 +74,7 @@ private function availableFiles($basename) {
public function createOptions($basename) {
$urlForAll = Folder::addPathElement(
Configure::read('Downloads.url'),
"$basename.tar.bz2"
"$basename.csv.zst"
);
$options[0] = [
'language' => __('All languages'),
Expand Down
10 changes: 5 additions & 5 deletions tests/TestCase/View/Helper/DownloadsHelperTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ class DownloadsHelperTest extends TestCase {

private static function createTempTree() {
$languages = ['eng', 'fra', 'jpn', 'unknown'];
$files = ['sentences.tsv.bz2', 'sentences_detailed.tsv.bz2', 'sentences_CC0.tsv.bz2'];
$files = ['sentences.tsv.zst', 'sentences_detailed.tsv.zst', 'sentences_CC0.tsv.zst'];
foreach ($languages as $lang) {
$path = Folder::addPathElement(TMP, ['exports', 'per_language', $lang]);
$subdir = new Folder($path, true);
Expand Down Expand Up @@ -54,7 +54,7 @@ public function testCreateOptions_InvalidBasename() {

$this->assertEquals(1, count($options));
$this->assertEquals(
Folder::addPathElement(Configure::read('Downloads.url'), "foobar.tar.bz2"),
Folder::addPathElement(Configure::read('Downloads.url'), "foobar.csv.zst"),
$options[0]['url']
);
}
Expand All @@ -77,14 +77,14 @@ public function testCreateOptions_ValidBasename($basename) {
$this->assertEquals(
Folder::addPathElement(
Configure::read('Downloads.url'),
"$basename.tar.bz2"
"$basename.csv.zst"
),
$options[0]['url']
);
$this->assertEquals(
Folder::addPathElement(
Configure::read('Downloads.url'),
['per_language', 'eng', "eng_$basename.tsv.bz2"]
['per_language', 'eng', "eng_$basename.tsv.zst"]
),
$options[1]['url']
);
Expand All @@ -103,7 +103,7 @@ public function testCreateOptions_NoPerLanguageFilesAvailable($basename) {
$this->assertEquals(
Folder::addPathElement(
Configure::read('Downloads.url'),
"$basename.tar.bz2"
"$basename.csv.zst"
),
$options[0]['url']
);
Expand Down