Commit 2a9cf4f

Fix docs workflow cache key to enable cache reuse (#946)
* Fix docs workflow cache key and disable parallel on cold cache

  The MNE data cache key was using github.run_id, which creates a unique key
  for every workflow run. This prevented cache reuse, causing every docs build
  to download all datasets fresh from Zenodo.

  Changes:
  - Use a static versioned cache key (v2) that persists across runs
  - Disable parallel Sphinx gallery builds when the cache is cold to avoid
    Zenodo rate limiting during the initial dataset download
  - Add SPHINX_GALLERY_PARALLEL env var to control parallel execution

  When the cache is hit, parallel builds are enabled for speed. When the cache
  is cold (first run), sequential builds prevent rate limiting.

* Add pre-download step with retry logic for cold cache

  When the MNE data cache is cold, pre-download datasets with:
  - Exponential backoff retry for Zenodo API rate limiting
  - Pre-caching of Zenodo metadata JSON
  - 30s delays between dataset downloads

  This ensures the first docs build succeeds by handling rate limiting
  gracefully before sphinx-gallery runs.

* Fix Zhou2016 metadata cache path to use BIDS format

* Simplify pre-download step to use MOABB dataset classes with retry

* Expand pre-download to include all Zenodo datasets used in examples

* Disable Sphinx parallel build on cold cache via SPHINX_JOBS env var

* Make pre-download step more robust and fail on errors

  - Increase retries from 3 to 5 with longer delays (90s-360s)
  - Increase wait between datasets from 45s to 60s
  - Download all 4 Zhou2016 subjects
  - Fail the workflow if any dataset download fails
  - Better logging with clear success/failure messages

* Improve docs cache persistence and Zenodo downloads

* [pre-commit.ci] auto fixes from pre-commit.com hooks

* Use HTTPS for BNCI dataset downloads

* Cache BNCI datasets in test workflow

* Switch BNCI downloads to mirror host

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 45cd0f8 · commit 2a9cf4f
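The retry behaviour described in the commit message (several attempts with growing delays, then a hard failure) can be condensed into a small standalone helper. The sketch below is illustrative only: download_with_retry is a hypothetical function, not part of the MOABB API, but the delay schedule (90 s, 180 s, 270 s, 360 s) and the dataset.download(subject_list=...) call mirror the workflow scripts in the diffs further down.

    import time

    from moabb.datasets import BNCI2014_001


    def download_with_retry(dataset, subjects, attempts=5, base_wait=90):
        """Download the given subjects, waiting longer after each failed attempt.

        Hypothetical helper mirroring the CI pre-download scripts: the wait grows
        as base_wait * attempt number (90 s, 180 s, 270 s, 360 s by default).
        """
        for attempt in range(1, attempts + 1):
            try:
                dataset.download(subject_list=subjects)
                return True
            except Exception as exc:  # e.g. Zenodo rate limiting (HTTP 429)
                print(f"Attempt {attempt}/{attempts} failed: {exc}")
                if attempt < attempts:
                    time.sleep(base_wait * attempt)
        return False


    # As in the docs workflow: fetch the first three subjects of BNCI2014_001.
    ds = BNCI2014_001()
    if not download_with_retry(ds, ds.subject_list[:3]):
        raise SystemExit(f"Could not download {type(ds).__name__}")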

8 files changed: +207 lines, -21 lines

.github/workflows/docs.yml

Lines changed: 80 additions & 5 deletions
@@ -32,18 +32,18 @@ jobs:
           python-version: ${{ matrix.python-version }}
           enable-cache: true
 
-      - name: Create/Restore MNE Data Cache
+      - name: Restore MNE Data Cache
         id: cache-mne_data
-        uses: actions/cache@v4
+        uses: actions/cache/restore@v4
         with:
           path: ~/mne_data
-          key: doc-${{ runner.os }}-mne-data-${{ github.run_id }}
+          key: doc-${{ runner.os }}-mne-data-v2-${{ github.run_id }}
           restore-keys: |
-            doc-${{ runner.os }}-mne-data-
+            doc-${{ runner.os }}-mne-data-v2-
 
       - name: Clean up corrupted cache
         run: |
-          echo "Cache hit: ${{ steps.cache-mne_data.outputs.cache-hit }}"
+          echo "Cache matched key: ${{ steps.cache-mne_data.outputs.cache-matched-key }}"
           mkdir -p ~/mne_data
           # Remove any incomplete extractions (e.g., BIDS.zip.unzip alongside final folder)
           if [ -d ~/mne_data/BIDS.zip.unzip ]; then
@@ -66,8 +66,76 @@ jobs:
         run: |
           uv pip install -e .[docs,deeplearning,optuna,external,carbonemission]
 
+      - name: Pre-download datasets (cold cache only)
+        if: steps.cache-mne_data.outputs.cache-matched-key == ''
+        run: |
+          echo "Cache is cold, pre-downloading datasets with delays to avoid rate limiting..."
+          python << 'EOF'
+          import sys
+          import time
+
+          # All datasets used in examples that download from Zenodo
+          datasets_to_download = [
+              # (module_path, class_name, num_subjects)
+              ('moabb.datasets', 'BNCI2014_001', 3),
+              ('moabb.datasets', 'BNCI2014_009', 2),
+              ('moabb.datasets', 'Zhou2016', 4),  # All 4 subjects
+              ('moabb.datasets', 'AlexMI', 1),
+              ('moabb.datasets', 'Kalunga2016', 2),
+              ('moabb.datasets', 'Cattan2019_VR', 1),
+              ('moabb.datasets', 'Hinss2021', 1),
+          ]
+
+          failed = []
+          for module_path, class_name, n_subjects in datasets_to_download:
+              print(f'\n{"="*60}')
+              print(f'Pre-downloading {class_name} ({n_subjects} subjects)...')
+              print(f'{"="*60}')
+              success = False
+              for attempt in range(5):  # More retries
+                  try:
+                      import importlib
+                      module = importlib.import_module(module_path)
+                      cls = getattr(module, class_name)
+                      ds = cls()
+                      subjects = ds.subject_list[:n_subjects]
+                      ds.download(subject_list=subjects)
+                      print(f'SUCCESS: {class_name} downloaded ({n_subjects} subjects)')
+                      success = True
+                      break
+                  except Exception as e:
+                      print(f'Attempt {attempt + 1}/5 failed: {e}')
+                      if attempt < 4:
+                          wait = 90 * (attempt + 1)  # 90s, 180s, 270s, 360s
+                          print(f'Waiting {wait}s before retry...')
+                          time.sleep(wait)
+              if not success:
+                  failed.append(class_name)
+                  print(f'FAILED: {class_name} after 5 attempts')
+              # Wait between datasets to avoid rate limiting
+              print('Waiting 60s before next dataset...')
+              time.sleep(60)
+
+          if failed:
+              print(f'\n{"="*60}')
+              print(f'ERROR: Failed to download: {", ".join(failed)}')
+              print(f'{"="*60}')
+              sys.exit(1)
+          else:
+              print(f'\n{"="*60}')
+              print('All datasets downloaded successfully!')
+              print(f'{"="*60}')
+          EOF
+
       - name: Build docs
+        env:
+          # Disable parallel builds when cache is cold to avoid Zenodo rate limiting
+          SPHINX_GALLERY_PARALLEL: ${{ steps.cache-mne_data.outputs.cache-matched-key != '' }}
+          SPHINX_JOBS: ${{ steps.cache-mne_data.outputs.cache-matched-key != '' && 'auto' || '1' }}
         run: |
+          echo "Cache matched key: ${{ steps.cache-mne_data.outputs.cache-matched-key }}"
+          echo "Parallel gallery builds: $SPHINX_GALLERY_PARALLEL"
+          echo "Sphinx jobs: $SPHINX_JOBS"
           cd docs && make html
 
       - name: Generate notebooks from examples (Colab)
@@ -84,6 +152,13 @@ jobs:
             python .github/scripts/convert_to_notebook.py --input "$f" --output "$out_path"
           done
 
+      - name: Save MNE Data Cache
+        if: success()
+        uses: actions/cache/save@v4
+        with:
+          path: ~/mne_data
+          key: doc-${{ runner.os }}-mne-data-v2-${{ github.run_id }}
+
       # Create an artifact of the html output.
       - uses: actions/upload-artifact@v4
         with:

.github/workflows/test.yml

Lines changed: 64 additions & 4 deletions
@@ -36,19 +36,72 @@ jobs:
           enable-cache: true
 
       # Cache MNE Data
-      - name: Create/Restore MNE Data Cache
+      - name: Restore MNE Data Cache
         id: cache-mne_data
-        uses: actions/cache@v4
+        uses: actions/cache/restore@v4
         with:
           path: ~/mne_data
-          key: ${{ runner.os }}-mne-data-v1
+          key: ${{ runner.os }}-mne-data-v2-${{ github.run_id }}
           restore-keys: |
-            ${{ runner.os }}-mne-data-v1
+            ${{ runner.os }}-mne-data-v2-
 
       - name: Install moabb
         run: |
           uv pip install -e .[tests,deeplearning,optuna]
 
+      - name: Pre-download BNCI datasets (cold cache only)
+        if: steps.cache-mne_data.outputs.cache-matched-key == ''
+        run: |
+          echo "Cache is cold, pre-downloading BNCI datasets..."
+          python << 'EOF'
+          import sys
+          import time
+
+          datasets_to_download = [
+              ("moabb.datasets", "BNCI2014_001", None),
+              ("moabb.datasets", "BNCI2015_001", None),
+          ]
+
+          failed = []
+          for module_path, class_name, n_subjects in datasets_to_download:
+              print(f"\n{'='*60}")
+              print(f"Pre-downloading {class_name}...")
+              print(f"{'='*60}")
+              success = False
+              for attempt in range(5):
+                  try:
+                      import importlib
+                      module = importlib.import_module(module_path)
+                      cls = getattr(module, class_name)
+                      ds = cls()
+                      subjects = ds.subject_list if n_subjects is None else ds.subject_list[:n_subjects]
+                      ds.download(subject_list=subjects)
+                      print(f"SUCCESS: {class_name} downloaded")
+                      success = True
+                      break
+                  except Exception as e:
+                      print(f"Attempt {attempt + 1}/5 failed: {e}")
+                      if attempt < 4:
+                          wait = 90 * (attempt + 1)
+                          print(f"Waiting {wait}s before retry...")
+                          time.sleep(wait)
+              if not success:
+                  failed.append(class_name)
+                  print(f"FAILED: {class_name} after 5 attempts")
+              print("Waiting 60s before next dataset...")
+              time.sleep(60)
+
+          if failed:
+              print(f"\n{'='*60}")
+              print(f"ERROR: Failed to download: {', '.join(failed)}")
+              print(f"{'='*60}")
+              sys.exit(1)
+          else:
+              print(f"\n{'='*60}")
+              print("All BNCI datasets downloaded successfully!")
+              print(f"{'='*60}")
+          EOF
+
       - name: Run tests
         run: |
           echo "Running tests"
@@ -66,3 +119,10 @@ jobs:
           directory: /home/runner/work/moabb/moabb
           files: ./.coverage,coverage.xml
           env_vars: OS,PYTHON
+
+      - name: Save MNE Data Cache
+        if: success()
+        uses: actions/cache/save@v4
+        with:
+          path: ~/mne_data
+          key: ${{ runner.os }}-mne-data-v2-${{ github.run_id }}

docs/Makefile

Lines changed: 4 additions & 2 deletions
@@ -1,7 +1,9 @@
 # Minimal Makefile for Sphinx documentation
 #
-# Variables (can be overridden from the command line)
-SPHINXOPTS = -j auto
+# Variables (can be overridden from the command line or environment)
+# Use SPHINX_JOBS env var to control parallelism (default: auto)
+SPHINX_JOBS ?= auto
+SPHINXOPTS ?= -j $(SPHINX_JOBS)
 SPHINXBUILD = sphinx-build
 SPHINXPROJ = moabb
 SOURCEDIR = source
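With this change, parallelism can be tuned from the environment instead of editing the Makefile: the docs workflow sets SPHINX_JOBS to '1' on a cold cache and 'auto' otherwise, and running, for example, SPHINX_JOBS=1 make html locally produces a sequential sphinx-build, because the ?= assignment only applies the default when the variable is not already set.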

docs/source/conf.py

Lines changed: 2 additions & 1 deletion
@@ -180,7 +180,8 @@ def linkcode_resolve(domain, info):  # noqa: C901
         ]
     ),
     "within_subsection_order": "FileNameSortKey",
-    "parallel": True,
+    # Disable parallel when cache is cold to avoid Zenodo rate limiting
+    "parallel": os.environ.get("SPHINX_GALLERY_PARALLEL", "true").lower() == "true",
 }
 
 

docs/source/whats_new.rst

Lines changed: 3 additions & 1 deletion
@@ -42,10 +42,12 @@ Bugs
 - Correct :class:`moabb.pipelines.classification.SSVEP_CCA`, :class:`moabb.pipelines.classification.SSVEP_TRCA` and :class:`moabb.pipelines.classification.SSVEP_MsetCCA` behavior (:gh:`625` by `Sylvain Chevallier`_)
 - Fix scikit-learn LogisticRegression elasticnet penalty parameter deprecation by re-adding `penalty='elasticnet'` for ElasticNet configurations with `0 < l1_ratio < 1` (:gh:`869` by `Bruno Aristimunha`_)
 - Fixing option to pickle model (:gh:`870` by `Ethan Davis`_)
+- Normalize Zenodo download paths and add a custom user-agent to improve download robustness (:gh:`946` by `Bruno Aristimunha`_)
+- Use the BNCI mirror host to avoid download timeouts (:gh:`946` by `Bruno Aristimunha`_)
 
 Code health
 ~~~~~~~~~~~
-- None yet.
+- Persist docs/test CI MNE dataset cache across runs to reduce cold-cache downloads (:gh:`946` by `Bruno Aristimunha`_)
 
 Version 1.4.3 (Stable - PyPi)
 -------------------------------

moabb/datasets/Zhou2016.py

Lines changed: 2 additions & 2 deletions
@@ -14,7 +14,7 @@
 
 from .base import BaseBIDSDataset
 from .bids_interface import get_bids_root
-from .download import download_if_missing
+from .download import download_if_missing, get_user_agent
 
 
 log = logging.getLogger(__name__)
@@ -126,7 +126,7 @@ def get_metainfo(self, path=None):
 
         if not Path(file_path).exists():
             # If not found, fetch from Zenodo
-            response = requests.get(ZENODO_URL)
+            response = requests.get(ZENODO_URL, headers={"User-Agent": get_user_agent()})
             response.raise_for_status()
             # Save the response to a file
             with _open_lock(file_path, "w") as f:

moabb/datasets/bnci.py

Lines changed: 1 addition & 1 deletion
@@ -16,7 +16,7 @@
 from moabb.utils import depreciated_alias
 
 
-BNCI_URL = "http://bnci-horizon-2020.eu/database/data-sets/"
+BNCI_URL = "https://lampx.tugraz.at/~bci/database/"
 BBCI_URL = "http://doc.ml.tu-berlin.de/bbci/"
 
 

moabb/datasets/download.py

Lines changed: 51 additions & 5 deletions
@@ -8,6 +8,7 @@
 import os
 import os.path as osp
 from pathlib import Path
+from urllib.parse import urlparse
 
 import pandas as pd
 import requests
@@ -22,6 +23,42 @@
 logger = logging.getLogger(__name__)
 
 
+def get_user_agent():
+    """Return a user agent string for outbound requests."""
+    try:
+        from importlib import metadata
+
+        version = metadata.version("moabb")
+        return f"moabb/{version} (https://github.com/NeuroTechX/moabb)"
+    except Exception:
+        return "moabb (https://github.com/NeuroTechX/moabb)"
+
+
+def _set_user_agent(downloader):
+    headers = downloader.kwargs.setdefault("headers", {})
+    headers.setdefault("User-Agent", get_user_agent())
+
+
+def _sanitize_path(path: Path) -> Path:
+    table = {ord(c): "-" for c in ':*?"<>|'}
+    return Path(str(path).translate(table))
+
+
+def _normalize_destination(url: str, root: Path) -> Path:
+    parsed = urlparse(url)
+    if parsed.scheme in {"http", "https"} and parsed.netloc == "zenodo.org":
+        parts = [p for p in parsed.path.split("/") if p]
+        if len(parts) >= 4 and parts[0] in {"record", "records"} and parts[2] == "files":
+            record_id = parts[1]
+            fname = parts[-1]
+            return root / "zenodo" / record_id / fname
+        if len(parts) >= 5 and parts[0] == "api" and parts[1] == "records":
+            record_id = parts[2]
+            fname = parts[-1]
+            return root / "zenodo" / record_id / fname
+    return Path(_url_to_local_path(url, root))
+
+
 def get_dataset_path(sign, path):
     """Returns the dataset path allowing for changes in MNE_DATA config.
 
@@ -140,14 +177,20 @@ def data_dl(url, sign, path=None, force_update=False, verbose=None):
     """
     path = Path(get_dataset_path(sign, path))
     key_dest = "MNE-{:s}-data".format(sign.lower())
-    destination = _url_to_local_path(url, path / key_dest)
-    destination = str(path) + destination.split(str(path))[1]
-    table = {ord(c): "-" for c in ':*?"<>|'}
-    destination = Path(str(path) + destination.split(str(path))[1].translate(table))
+    root = path / key_dest
+    destination = _sanitize_path(_normalize_destination(url, root))
+    legacy_destination = _sanitize_path(Path(_url_to_local_path(url, root)))
+    if legacy_destination.exists() and not destination.exists():
+        destination.parent.mkdir(parents=True, exist_ok=True)
+        try:
+            legacy_destination.replace(destination)
+        except OSError:
+            destination = legacy_destination
 
     downloader = choose_downloader(url, progressbar=True)
     if type(downloader).__name__ in ["HTTPDownloader", "DOIDownloader"]:
         downloader.kwargs.setdefault("verify", False)
+    _set_user_agent(downloader)
 
     # Fetch the file
     if not destination.is_file() or force_update:
@@ -160,7 +203,7 @@ def data_dl(url, sign, path=None, force_update=False, verbose=None):
         dlpath = retrieve(
             url,
             known_hash,
-            fname=Path(url).name,
+            fname=destination.name,
             path=str(destination.parent),
             progressbar=True,
             downloader=downloader,
@@ -322,6 +365,9 @@ def download_if_missing(file_path, url, warn_missing=True, verbose=True):
         warn(f"{file_path} not found. Downloading from {url}")
 
     downloader = choose_downloader(url, progressbar=verbose)
+    if type(downloader).__name__ in ["HTTPDownloader", "DOIDownloader"]:
+        downloader.kwargs.setdefault("verify", False)
+    _set_user_agent(downloader)
 
     path = retrieve(
         url,
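To make the new path layout concrete, the private helper _normalize_destination can be exercised directly; it does no network access. The snippet below is a hedged illustration: the record id, file name, and dataset root are made up for the example (in data_dl the root comes from get_dataset_path(sign, path) joined with "MNE-<sign>-data").

    from pathlib import Path

    from moabb.datasets.download import _normalize_destination

    # Illustrative root only; the real code derives it from get_dataset_path().
    root = Path.home() / "mne_data" / "MNE-example-data"

    # Classic and API-style Zenodo URLs both collapse to <root>/zenodo/<record_id>/<file>.
    print(_normalize_destination("https://zenodo.org/record/1234567/files/subject_01.zip", root))
    # e.g. /home/runner/mne_data/MNE-example-data/zenodo/1234567/subject_01.zip
    print(_normalize_destination("https://zenodo.org/api/records/1234567/files/subject_01.zip", root))
    # same destination as above, so record-style and API-style URLs share one cached copy

    # Any other URL falls back to the legacy _url_to_local_path layout under the same root.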
