Commit 2a9cf4f

Fix docs workflow cache key to enable cache reuse (#946)
* Fix docs workflow cache key and disable parallel on cold cache

  The MNE data cache key was using github.run_id, which creates a unique key
  for every workflow run. This prevented cache reuse, causing every docs build
  to download all datasets fresh from Zenodo.

  Changes:
  - Use a static versioned cache key (v2) that persists across runs
  - Disable parallel Sphinx gallery builds when the cache is cold to avoid
    Zenodo rate limiting during the initial dataset download
  - Add SPHINX_GALLERY_PARALLEL env var to control parallel execution

  When the cache is hit, parallel builds are enabled for speed. When the cache
  is cold (first run), sequential builds prevent rate limiting.

* Add pre-download step with retry logic for cold cache

  When the MNE data cache is cold, pre-download datasets with:
  - Exponential backoff retry for Zenodo API rate limiting
  - Pre-caching of Zenodo metadata JSON
  - 30s delays between dataset downloads

  This ensures the first docs build succeeds by handling rate limiting
  gracefully before sphinx-gallery runs.

* Fix Zhou2016 metadata cache path to use BIDS format

* Simplify pre-download step to use MOABB dataset classes with retry

* Expand pre-download to include all Zenodo datasets used in examples

* Disable Sphinx parallel build on cold cache via SPHINX_JOBS env var

* Make pre-download step more robust and fail on errors

  - Increase retries from 3 to 5 with longer delays (90s-360s)
  - Increase wait between datasets from 45s to 60s
  - Download all 4 Zhou2016 subjects
  - Fail the workflow if any dataset download fails
  - Better logging with clear success/failure messages

* Improve docs cache persistence and Zenodo downloads

* [pre-commit.ci] auto fixes from pre-commit.com hooks

* Use HTTPS for BNCI dataset downloads

* Cache BNCI datasets in test workflow

* Switch BNCI downloads to mirror host

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 45cd0f8 · commit 2a9cf4f
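The retry behaviour described in the commit message (several attempts with growing delays, then a hard failure) can be condensed into a small standalone helper. The sketch below is illustrative only: download_with_retry is a hypothetical function, not part of the MOABB API, but the delay schedule (90 s, 180 s, 270 s, 360 s) and the dataset.download(subject_list=...) call mirror the workflow scripts in the diffs further down.

    import time

    from moabb.datasets import BNCI2014_001


    def download_with_retry(dataset, subjects, attempts=5, base_wait=90):
        """Download the given subjects, waiting longer after each failed attempt.

        Hypothetical helper mirroring the CI pre-download scripts: the wait grows
        as base_wait * attempt number (90 s, 180 s, 270 s, 360 s by default).
        """
        for attempt in range(1, attempts + 1):
            try:
                dataset.download(subject_list=subjects)
                return True
            except Exception as exc:  # e.g. Zenodo rate limiting (HTTP 429)
                print(f"Attempt {attempt}/{attempts} failed: {exc}")
                if attempt < attempts:
                    time.sleep(base_wait * attempt)
        return False


    # As in the docs workflow: fetch the first three subjects of BNCI2014_001.
    ds = BNCI2014_001()
    if not download_with_retry(ds, ds.subject_list[:3]):
        raise SystemExit(f"Could not download {type(ds).__name__}")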

8 files changed: +207 lines, -21 lines

.github/workflows/docs.yml

Lines changed: 80 additions & 5 deletions
@@ -32,18 +32,18 @@ jobs:
           python-version: ${{ matrix.python-version }}
           enable-cache: true
 
-      - name: Create/Restore MNE Data Cache
+      - name: Restore MNE Data Cache
         id: cache-mne_data
-        uses: actions/cache@v4
+        uses: actions/cache/restore@v4
         with:
           path: ~/mne_data
-          key: doc-${{ runner.os }}-mne-data-${{ github.run_id }}
+          key: doc-${{ runner.os }}-mne-data-v2-${{ github.run_id }}
           restore-keys: |
-            doc-${{ runner.os }}-mne-data-
+            doc-${{ runner.os }}-mne-data-v2-
 
       - name: Clean up corrupted cache
         run: |
-          echo "Cache hit: ${{ steps.cache-mne_data.outputs.cache-hit }}"
+          echo "Cache matched key: ${{ steps.cache-mne_data.outputs.cache-matched-key }}"
           mkdir -p ~/mne_data
           # Remove any incomplete extractions (e.g., BIDS.zip.unzip alongside final folder)
           if [ -d ~/mne_data/BIDS.zip.unzip ]; then
@@ -66,8 +66,76 @@ jobs:
         run: |
           uv pip install -e .[docs,deeplearning,optuna,external,carbonemission]
 
+      - name: Pre-download datasets (cold cache only)
+        if: steps.cache-mne_data.outputs.cache-matched-key == ''
+        run: |
+          echo "Cache is cold, pre-downloading datasets with delays to avoid rate limiting..."
+          python << 'EOF'
+          import sys
+          import time
+
+          # All datasets used in examples that download from Zenodo
+          datasets_to_download = [
+              # (module_path, class_name, num_subjects)
+              ('moabb.datasets', 'BNCI2014_001', 3),
+              ('moabb.datasets', 'BNCI2014_009', 2),
+              ('moabb.datasets', 'Zhou2016', 4),  # All 4 subjects
+              ('moabb.datasets', 'AlexMI', 1),
+              ('moabb.datasets', 'Kalunga2016', 2),
+              ('moabb.datasets', 'Cattan2019_VR', 1),
+              ('moabb.datasets', 'Hinss2021', 1),
+          ]
+
+          failed = []
+          for module_path, class_name, n_subjects in datasets_to_download:
+              print(f'\n{"="*60}')
+              print(f'Pre-downloading {class_name} ({n_subjects} subjects)...')
+              print(f'{"="*60}')
+              success = False
+              for attempt in range(5):  # More retries
+                  try:
+                      import importlib
+                      module = importlib.import_module(module_path)
+                      cls = getattr(module, class_name)
+                      ds = cls()
+                      subjects = ds.subject_list[:n_subjects]
+                      ds.download(subject_list=subjects)
+                      print(f'SUCCESS: {class_name} downloaded ({n_subjects} subjects)')
+                      success = True
+                      break
+                  except Exception as e:
+                      print(f'Attempt {attempt + 1}/5 failed: {e}')
+                      if attempt < 4:
+                          wait = 90 * (attempt + 1)  # 90s, 180s, 270s, 360s
+                          print(f'Waiting {wait}s before retry...')
+                          time.sleep(wait)
+              if not success:
+                  failed.append(class_name)
+                  print(f'FAILED: {class_name} after 5 attempts')
+              # Wait between datasets to avoid rate limiting
+              print('Waiting 60s before next dataset...')
+              time.sleep(60)
+
+          if failed:
+              print(f'\n{"="*60}')
+              print(f'ERROR: Failed to download: {", ".join(failed)}')
+              print(f'{"="*60}')
+              sys.exit(1)
+          else:
+              print(f'\n{"="*60}')
+              print('All datasets downloaded successfully!')
+              print(f'{"="*60}')
+          EOF
+
       - name: Build docs
+        env:
+          # Disable parallel builds when cache is cold to avoid Zenodo rate limiting
+          SPHINX_GALLERY_PARALLEL: ${{ steps.cache-mne_data.outputs.cache-matched-key != '' }}
+          SPHINX_JOBS: ${{ steps.cache-mne_data.outputs.cache-matched-key != '' && 'auto' || '1' }}
         run: |
+          echo "Cache matched key: ${{ steps.cache-mne_data.outputs.cache-matched-key }}"
+          echo "Parallel gallery builds: $SPHINX_GALLERY_PARALLEL"
+          echo "Sphinx jobs: $SPHINX_JOBS"
           cd docs && make html
 
       - name: Generate notebooks from examples (Colab)
@@ -84,6 +152,13 @@ jobs:
             python .github/scripts/convert_to_notebook.py --input "$f" --output "$out_path"
           done
 
+      - name: Save MNE Data Cache
+        if: success()
+        uses: actions/cache/save@v4
+        with:
+          path: ~/mne_data
+          key: doc-${{ runner.os }}-mne-data-v2-${{ github.run_id }}
+
       # Create an artifact of the html output.
       - uses: actions/upload-artifact@v4
         with:

.github/workflows/test.yml

Lines changed: 64 additions & 4 deletions
@@ -36,19 +36,72 @@ jobs:
           enable-cache: true
 
       # Cache MNE Data
-      - name: Create/Restore MNE Data Cache
+      - name: Restore MNE Data Cache
         id: cache-mne_data
-        uses: actions/cache@v4
+        uses: actions/cache/restore@v4
         with:
           path: ~/mne_data
-          key: ${{ runner.os }}-mne-data-v1
+          key: ${{ runner.os }}-mne-data-v2-${{ github.run_id }}
           restore-keys: |
-            ${{ runner.os }}-mne-data-v1
+            ${{ runner.os }}-mne-data-v2-
 
       - name: Install moabb
         run: |
           uv pip install -e .[tests,deeplearning,optuna]
 
+      - name: Pre-download BNCI datasets (cold cache only)
+        if: steps.cache-mne_data.outputs.cache-matched-key == ''
+        run: |
+          echo "Cache is cold, pre-downloading BNCI datasets..."
+          python << 'EOF'
+          import sys
+          import time
+
+          datasets_to_download = [
+              ("moabb.datasets", "BNCI2014_001", None),
+              ("moabb.datasets", "BNCI2015_001", None),
+          ]
+
+          failed = []
+          for module_path, class_name, n_subjects in datasets_to_download:
+              print(f"\n{'='*60}")
+              print(f"Pre-downloading {class_name}...")
+              print(f"{'='*60}")
+              success = False
+              for attempt in range(5):
+                  try:
+                      import importlib
+                      module = importlib.import_module(module_path)
+                      cls = getattr(module, class_name)
+                      ds = cls()
+                      subjects = ds.subject_list if n_subjects is None else ds.subject_list[:n_subjects]
+                      ds.download(subject_list=subjects)
+                      print(f"SUCCESS: {class_name} downloaded")
+                      success = True
+                      break
+                  except Exception as e:
+                      print(f"Attempt {attempt + 1}/5 failed: {e}")
+                      if attempt < 4:
+                          wait = 90 * (attempt + 1)
+                          print(f"Waiting {wait}s before retry...")
+                          time.sleep(wait)
+              if not success:
+                  failed.append(class_name)
+                  print(f"FAILED: {class_name} after 5 attempts")
+              print("Waiting 60s before next dataset...")
+              time.sleep(60)
+
+          if failed:
+              print(f"\n{'='*60}")
+              print(f"ERROR: Failed to download: {', '.join(failed)}")
+              print(f"{'='*60}")
+              sys.exit(1)
+          else:
+              print(f"\n{'='*60}")
+              print("All BNCI datasets downloaded successfully!")
+              print(f"{'='*60}")
+          EOF
+
       - name: Run tests
         run: |
           echo "Running tests"
@@ -66,3 +119,10 @@ jobs:
           directory: /home/runner/work/moabb/moabb
           files: ./.coverage,coverage.xml
           env_vars: OS,PYTHON
+
+      - name: Save MNE Data Cache
+        if: success()
+        uses: actions/cache/save@v4
+        with:
+          path: ~/mne_data
+          key: ${{ runner.os }}-mne-data-v2-${{ github.run_id }}

docs/Makefile

Lines changed: 4 additions & 2 deletions
@@ -1,7 +1,9 @@
 # Minimal Makefile for Sphinx documentation
 #
-# Variables (can be overridden from the command line)
-SPHINXOPTS = -j auto
+# Variables (can be overridden from the command line or environment)
+# Use SPHINX_JOBS env var to control parallelism (default: auto)
+SPHINX_JOBS ?= auto
+SPHINXOPTS ?= -j $(SPHINX_JOBS)
 SPHINXBUILD = sphinx-build
 SPHINXPROJ = moabb
 SOURCEDIR = source
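With this change, parallelism can be tuned from the environment instead of editing the Makefile: the docs workflow sets SPHINX_JOBS to '1' on a cold cache and 'auto' otherwise, and running, for example, SPHINX_JOBS=1 make html locally produces a sequential sphinx-build, because the ?= assignment only applies the default when the variable is not already set.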

docs/source/conf.py

Lines changed: 2 additions & 1 deletion
@@ -180,7 +180,8 @@ def linkcode_resolve(domain, info):  # noqa: C901
         ]
     ),
     "within_subsection_order": "FileNameSortKey",
-    "parallel": True,
+    # Disable parallel when cache is cold to avoid Zenodo rate limiting
+    "parallel": os.environ.get("SPHINX_GALLERY_PARALLEL", "true").lower() == "true",
 }
 
 

docs/source/whats_new.rst

Lines changed: 3 additions & 1 deletion
@@ -42,10 +42,12 @@ Bugs
 - Correct :class:`moabb.pipelines.classification.SSVEP_CCA`, :class:`moabb.pipelines.classification.SSVEP_TRCA` and :class:`moabb.pipelines.classification.SSVEP_MsetCCA` behavior (:gh:`625` by `Sylvain Chevallier`_)
 - Fix scikit-learn LogisticRegression elasticnet penalty parameter deprecation by re-adding `penalty='elasticnet'` for ElasticNet configurations with `0 < l1_ratio < 1` (:gh:`869` by `Bruno Aristimunha`_)
 - Fixing option to pickle model (:gh:`870` by `Ethan Davis`_)
+- Normalize Zenodo download paths and add a custom user-agent to improve download robustness (:gh:`946` by `Bruno Aristimunha`_)
+- Use the BNCI mirror host to avoid download timeouts (:gh:`946` by `Bruno Aristimunha`_)
 
 Code health
 ~~~~~~~~~~~
-- None yet.
+- Persist docs/test CI MNE dataset cache across runs to reduce cold-cache downloads (:gh:`946` by `Bruno Aristimunha`_)
 
 Version 1.4.3 (Stable - PyPi)
 -------------------------------

moabb/datasets/Zhou2016.py

Lines changed: 2 additions & 2 deletions
@@ -14,7 +14,7 @@
 
 from .base import BaseBIDSDataset
 from .bids_interface import get_bids_root
-from .download import download_if_missing
+from .download import download_if_missing, get_user_agent
 
 
 log = logging.getLogger(__name__)
@@ -126,7 +126,7 @@ def get_metainfo(self, path=None):
 
         if not Path(file_path).exists():
             # If not found, fetch from Zenodo
-            response = requests.get(ZENODO_URL)
+            response = requests.get(ZENODO_URL, headers={"User-Agent": get_user_agent()})
             response.raise_for_status()
             # Save the response to a file
             with _open_lock(file_path, "w") as f:

moabb/datasets/bnci.py

Lines changed: 1 addition & 1 deletion
@@ -16,7 +16,7 @@
 from moabb.utils import depreciated_alias
 
 
-BNCI_URL = "http://bnci-horizon-2020.eu/database/data-sets/"
+BNCI_URL = "https://lampx.tugraz.at/~bci/database/"
 BBCI_URL = "http://doc.ml.tu-berlin.de/bbci/"
 
 

moabb/datasets/download.py

Lines changed: 51 additions & 5 deletions
@@ -8,6 +8,7 @@
 import os
 import os.path as osp
 from pathlib import Path
+from urllib.parse import urlparse
 
 import pandas as pd
 import requests
@@ -22,6 +23,42 @@
 logger = logging.getLogger(__name__)
 
 
+def get_user_agent():
+    """Return a user agent string for outbound requests."""
+    try:
+        from importlib import metadata
+
+        version = metadata.version("moabb")
+        return f"moabb/{version} (https://github.com/NeuroTechX/moabb)"
+    except Exception:
+        return "moabb (https://github.com/NeuroTechX/moabb)"
+
+
+def _set_user_agent(downloader):
+    headers = downloader.kwargs.setdefault("headers", {})
+    headers.setdefault("User-Agent", get_user_agent())
+
+
+def _sanitize_path(path: Path) -> Path:
+    table = {ord(c): "-" for c in ':*?"<>|'}
+    return Path(str(path).translate(table))
+
+
+def _normalize_destination(url: str, root: Path) -> Path:
+    parsed = urlparse(url)
+    if parsed.scheme in {"http", "https"} and parsed.netloc == "zenodo.org":
+        parts = [p for p in parsed.path.split("/") if p]
+        if len(parts) >= 4 and parts[0] in {"record", "records"} and parts[2] == "files":
+            record_id = parts[1]
+            fname = parts[-1]
+            return root / "zenodo" / record_id / fname
+        if len(parts) >= 5 and parts[0] == "api" and parts[1] == "records":
+            record_id = parts[2]
+            fname = parts[-1]
+            return root / "zenodo" / record_id / fname
+    return Path(_url_to_local_path(url, root))
+
+
 def get_dataset_path(sign, path):
     """Returns the dataset path allowing for changes in MNE_DATA config.
 
@@ -140,14 +177,20 @@ def data_dl(url, sign, path=None, force_update=False, verbose=None):
     """
     path = Path(get_dataset_path(sign, path))
     key_dest = "MNE-{:s}-data".format(sign.lower())
-    destination = _url_to_local_path(url, path / key_dest)
-    destination = str(path) + destination.split(str(path))[1]
-    table = {ord(c): "-" for c in ':*?"<>|'}
-    destination = Path(str(path) + destination.split(str(path))[1].translate(table))
+    root = path / key_dest
+    destination = _sanitize_path(_normalize_destination(url, root))
+    legacy_destination = _sanitize_path(Path(_url_to_local_path(url, root)))
+    if legacy_destination.exists() and not destination.exists():
+        destination.parent.mkdir(parents=True, exist_ok=True)
+        try:
+            legacy_destination.replace(destination)
+        except OSError:
+            destination = legacy_destination
 
     downloader = choose_downloader(url, progressbar=True)
     if type(downloader).__name__ in ["HTTPDownloader", "DOIDownloader"]:
         downloader.kwargs.setdefault("verify", False)
+    _set_user_agent(downloader)
 
     # Fetch the file
     if not destination.is_file() or force_update:
@@ -160,7 +203,7 @@ def data_dl(url, sign, path=None, force_update=False, verbose=None):
         dlpath = retrieve(
             url,
             known_hash,
-            fname=Path(url).name,
+            fname=destination.name,
             path=str(destination.parent),
             progressbar=True,
             downloader=downloader,
@@ -322,6 +365,9 @@ def download_if_missing(file_path, url, warn_missing=True, verbose=True):
         warn(f"{file_path} not found. Downloading from {url}")
 
     downloader = choose_downloader(url, progressbar=verbose)
+    if type(downloader).__name__ in ["HTTPDownloader", "DOIDownloader"]:
+        downloader.kwargs.setdefault("verify", False)
+    _set_user_agent(downloader)
 
     path = retrieve(
         url,
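To make the new path layout concrete, the private helper _normalize_destination can be exercised directly; it does no network access. The snippet below is a hedged illustration: the record id, file name, and dataset root are made up for the example (in data_dl the root comes from get_dataset_path(sign, path) joined with "MNE-<sign>-data").

    from pathlib import Path

    from moabb.datasets.download import _normalize_destination

    # Illustrative root only; the real code derives it from get_dataset_path().
    root = Path.home() / "mne_data" / "MNE-example-data"

    # Classic and API-style Zenodo URLs both collapse to <root>/zenodo/<record_id>/<file>.
    print(_normalize_destination("https://zenodo.org/record/1234567/files/subject_01.zip", root))
    # e.g. /home/runner/mne_data/MNE-example-data/zenodo/1234567/subject_01.zip
    print(_normalize_destination("https://zenodo.org/api/records/1234567/files/subject_01.zip", root))
    # same destination as above, so record-style and API-style URLs share one cached copy

    # Any other URL falls back to the legacy _url_to_local_path layout under the same root.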
