ecmwf
diff --git a/‎.github/CODEOWNERS
Lines changed: 7 additions & 5 deletions b/‎.github/CODEOWNERS
Lines changed: 7 additions & 5 deletions
diff --git a/‎.pre-commit-config.yaml
Lines changed: 2 additions & 3 deletions b/‎.pre-commit-config.yaml
Lines changed: 2 additions & 3 deletions
diff --git a/‎docs/howtos/usage/02-coutout-complement-combination.rst
Lines changed: 26 additions & 0 deletions b/‎docs/howtos/usage/02-coutout-complement-combination.rst
Lines changed: 26 additions & 0 deletions
diff --git a/‎docs/howtos/usage/code/cutout-complement1.py
Lines changed: 17 additions & 0 deletions b/‎docs/howtos/usage/code/cutout-complement1.py
Lines changed: 17 additions & 0 deletions
diff --git a/‎docs/howtos/usage/yaml/cutout-complement1.yaml
Lines changed: 11 additions & 0 deletions b/‎docs/howtos/usage/yaml/cutout-complement1.yaml
Lines changed: 11 additions & 0 deletions
diff --git a/‎pyproject.toml
Lines changed: 2 additions & 1 deletion b/‎pyproject.toml
Lines changed: 2 additions & 1 deletion
diff --git a/‎src/anemoi/datasets/commands/check.py
Lines changed: 62 additions & 0 deletions b/‎src/anemoi/datasets/commands/check.py
Lines changed: 62 additions & 0 deletions
diff --git a/‎src/anemoi/datasets/commands/copy.py
Lines changed: 36 additions & 3 deletions b/‎src/anemoi/datasets/commands/copy.py
Lines changed: 36 additions & 3 deletions
diff --git a/‎src/anemoi/datasets/commands/create.py
Lines changed: 2 additions & 3 deletions b/‎src/anemoi/datasets/commands/create.py
Lines changed: 2 additions & 3 deletions
diff --git a/‎src/anemoi/datasets/commands/scan.py
Lines changed: 17 additions & 5 deletions b/‎src/anemoi/datasets/commands/scan.py
Lines changed: 17 additions & 5 deletions
diff --git a/‎src/anemoi/datasets/create/__init__.py
Lines changed: 17 additions & 7 deletions b/‎src/anemoi/datasets/create/__init__.py
Lines changed: 17 additions & 7 deletions
diff --git a/‎src/anemoi/datasets/create/check.py
Lines changed: 19 additions & 1 deletion b/‎src/anemoi/datasets/create/check.py
Lines changed: 19 additions & 1 deletion
@@ -1,6 +1,8 @@
-# CODEOWNERS file
 
-# Protect workflow files
-/.github/ @theissenhelen @jesperdramsch @gmertes @b8raoult @floriankrb @anaprietonem @HCookie @JPXKQX @mchantry
-/.pre-commit-config.yaml @theissenhelen @jesperdramsch @gmertes @b8raoult @floriankrb @anaprietonem @HCookie @JPXKQX @mchantry
-/pyproject.toml @theissenhelen @jesperdramsch @gmertes @b8raoult @floriankrb @anaprietonem @HCookie @JPXKQX @mchantry
+# Workflows
+/.github/ @ecmwf/AnemoiSecurity
+
+# Project configs
+/pyproject.toml @ecmwf/AnemoiSecurity
+/.pre-commit-config.yaml @ecmwf/AnemoiSecurity
+/.release-please-config.json @ecmwf/AnemoiSecurity
@@ -40,14 +40,13 @@ repos:
     - --force-single-line-imports
     - --profile black
 - repo: https://github.com/astral-sh/ruff-pre-commit
-  rev: v0.9.9
+  rev: v0.11.4
   hooks:
   - id: ruff
     args:
     - --line-length=120
     - --fix
     - --exit-non-zero-on-fix
-    - --preview
     - --exclude=docs/**/*_.py
 - repo: https://github.com/sphinx-contrib/sphinx-lint
   rev: v1.0.0
@@ -69,7 +68,7 @@ repos:
   hooks:
   - id: pyproject-fmt
 -   repo: https://github.com/jshwi/docsig # Check docstrings against function sig
-    rev: v0.69.1
+    rev: v0.69.3
     hooks:
     -   id: docsig
         args:
 
@@ -0,0 +1,26 @@
+.. _complement-step:
+
+##############################################
+ Combining cutout with complementing datasets
+##############################################
+
+Here we explain how to combine a cutout with a complementing dataset.
+
+****************************
+ Interpolate to cutout grid
+****************************
+
+In this case, we will use a ``lam-dataset`` in a different grid that
+contains just one variable (``tp`` in the example below) and a
+``global-dataset``. What we want to do is to interpolate the
+``global-dataset`` to the resulting dataset from the cutout grid
+operation.
+
+.. literalinclude:: code/cutout-complement1.py
+
+or for the config file:
+
+.. literalinclude:: yaml/cutout-complement1.yaml
+
+The ``adjust`` option covers the case where the start or end dates do
+not match exactly.
@@ -0,0 +1,17 @@
+from anemoi.datasets import open_dataset
+
+# Open a ``complement`` dataset whose first part is a cutout built from
+# ``lam-dataset`` and ``global-dataset`` (the latter restricted to ``tp``),
+# with ``global-dataset`` as the source and ``nearest`` interpolation.
+ds = open_dataset(
+    complement={
+        "cutout": [
+            "lam-dataset",
+            {
+                "dataset": "global-dataset",
+                "select": ["tp"],
+            },
+        ],
+        "min_distance_km": 1,
+        "adjust": "dates",
+    },
+    source="global-dataset",
+    interpolation="nearest",
+)
@@ -0,0 +1,11 @@
+dataset:
+   complement:
+      dataset:
+         cutout:
+          - lam-dataset
+          - dataset: global-dataset
+            select: [ tp ]
+         min_distance_km: 1
+         adjust: dates
+   source: global-dataset
+   interpolation: nearest
@@ -50,7 +50,7 @@ dynamic = [
 ]
 dependencies = [
   "anemoi-transform>=0.1.9",
-  "anemoi-utils[provenance]>=0.4.19",
+  "anemoi-utils[provenance]>=0.4.21",
   "cfunits",
   "numcodecs<0.16",                   # Until we move to zarr3
   "numpy",
@@ -109,6 +109,7 @@ optional-dependencies.xarray = [
   "pandas",
   "planetary-computer",
   "pystac-client",
+  "s3fs>=0.5",
 ]
 
 urls.Changelog = "https://github.com/ecmwf/anemoi-datasets/CHANGELOG.md"
 
@@ -0,0 +1,62 @@
+# (C) Copyright 2024 Anemoi contributors.
+#
+# This software is licensed under the terms of the Apache Licence Version 2.0
+# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+#
+# In applying this licence, ECMWF does not waive the privileges and immunities
+# granted to it by virtue of its status as an intergovernmental organisation
+# nor does it submit to any jurisdiction.
+
+import logging
+import os
+from typing import Any
+
+import yaml
+
+from anemoi.datasets.create.check import DatasetName
+
+from . import Command
+
+LOG = logging.getLogger(__name__)
+
+
+class Check(Command):
+    """Check that a dataset name follows the naming conventions."""
+
+    timestamp = True
+
+    def add_arguments(self, command_parser: Any) -> None:
+        """Add command line arguments to the parser.
+
+        Parameters
+        ----------
+        command_parser : Any
+            The command line argument parser.
+        """
+        command_parser.add_argument(
+            "--recipe",
+            help="Path to a recipe file; its 'name' entry is checked against the naming conventions.",
+        )
+        command_parser.add_argument(
+            "--name",
+            help="Dataset name to check against the naming conventions.",
+        )
+
+    def run(self, args: Any) -> None:
+        """Validate the dataset name found in a recipe and/or given directly.
+
+        Parameters
+        ----------
+        args : Any
+            Parsed command line arguments; ``recipe`` and ``name`` are read.
+        """
+        if args.recipe:
+            recipe_filename = os.path.basename(args.recipe)
+            recipe_name = os.path.splitext(recipe_filename)[0]
+            # Use a context manager so the recipe file is closed deterministically.
+            with open(args.recipe, "r", encoding="utf-8") as f:
+                in_recipe_name = yaml.safe_load(f)["name"]
+            if recipe_name != in_recipe_name:
+                print(f"Recipe name {recipe_name} does not match the name in the recipe file {in_recipe_name}")
+
+            # The name declared inside the recipe is the one that gets validated.
+            DatasetName(name=in_recipe_name).raise_if_not_valid()
+
+        if args.name:
+            DatasetName(name=args.name).raise_if_not_valid()
+
+
+command = Check
@@ -319,10 +319,30 @@ def copy_group(self, source: Any, target: Any, _copy: Any, verbosity: int) -> No
         """
         import zarr
 
+        if self.verbosity > 0:
+            LOG.info(f"Copying group {source} to {target}")
+
         for k, v in source.attrs.items():
+            if self.verbosity > 1:
+                import textwrap
+
+                LOG.info(f"Copying attribute {k} = {textwrap.shorten(str(v), 40)}")
             target.attrs[k] = v
 
-        for name in sorted(source.keys()):
+        source_keys = list(source.keys())
+
+        if not source_keys:
+            raise ValueError(f"Source group {source} is empty.")
+
+        if self.verbosity > 1:
+            LOG.info(f"Keys {source_keys}")
+
+        for name in sorted(source_keys):
+            if name.startswith("."):
+                if self.verbosity > 1:
+                    LOG.info(f"Skipping {name}")
+                continue
+
             if isinstance(source[name], zarr.hierarchy.Group):
                 group = target[name] if name in target else target.create_group(name)
                 self.copy_group(
@@ -362,6 +382,11 @@ def copy(self, source: Any, target: Any, verbosity: int) -> None:
         _copy = target["_copy"]
         _copy_np = _copy[:]
 
+        if self.verbosity > 1:
+            import numpy as np
+
+            LOG.info(f"copy {np.sum(_copy_np)} of {len(_copy_np)}")
+
         self.copy_group(source, target, _copy_np, verbosity)
         del target["_copy"]
 
@@ -417,11 +442,19 @@ def open_target() -> Any:
             LOG.error("Target already exists, use either --overwrite or --resume.")
             sys.exit(1)
 
+        if self.verbosity > 0:
+            LOG.info(f"Open target: {self.target}")
+
         target = open_target()
 
         assert target is not None, target
 
+        if self.verbosity > 0:
+            LOG.info(f"Open source: {self.source}")
+
         source = zarr.open(self._store(self.source), mode="r")
+        # zarr.consolidate_metadata(source)
+
         self.copy(source, target, self.verbosity)
 
 
@@ -488,8 +521,8 @@ def run(self, args: Any) -> None:
                 if args.source.startswith("s3://") and not args.source.endswith("/"):
                     args.source = args.source + "/"
                 copier = Transfer(
-                    args.source,
-                    args.target,
+                    source=args.source,
+                    target=args.target,
                     overwrite=args.overwrite,
                     resume=args.resume,
                     verbosity=args.verbosity,
 
@@ -180,10 +180,9 @@ def parallel_create(self, args: Any) -> None:
             executor.submit(task, "init-additions", options).result()
 
         with ExecutorClass(max_workers=parallel) as executor:
-            opt = options.copy()
-            opt["parts"] = f"{n+1}/{total}"
-            futures.append(executor.submit(task, "load", opt))
             for n in range(total):
+                opt = options.copy()
+                opt["parts"] = f"{n+1}/{total}"
                 futures.append(executor.submit(task, "load-additions", opt))
 
             for future in tqdm.tqdm(
 
@@ -23,6 +23,16 @@
 
 
 class Scan(Command):
+    """Command to scan files and generate a configuration file.
+
+    Attributes
+    ----------
+    internal : bool
+        Indicates whether the command is internal.
+    timestamp : bool
+        Indicates whether to include a timestamp.
+    """
+
     internal = True
     timestamp = True
 
@@ -32,8 +42,9 @@ def add_arguments(self, command_parser: Any) -> None:
         Parameters
         ----------
         command_parser : Any
-            The command parser to which arguments are added.
+            The command-line argument parser.
         """
+
         command_parser.add_argument(
             "--match",
             help="Give a glob pattern to match files (default: *.grib)",
@@ -51,22 +62,23 @@ def run(self, args: Any) -> None:
         Parameters
         ----------
         args : Any
-            The arguments passed to the command.
+            Parsed command-line arguments.
         """
 
         def match(path: str) -> bool:
-            """Check if a path matches the given pattern.
+            """Check if a file path matches the given glob pattern.
 
             Parameters
             ----------
             path : str
-                The path to check.
+                The file path to check.
 
             Returns
             -------
             bool
-                True if the path matches, False otherwise.
+                True if the path matches the pattern, False otherwise.
             """
+
             return fnmatch.fnmatch(path, args.match)
 
         paths = []
 
@@ -938,13 +938,23 @@ def check_shape(cube, dates, dates_in_data):
         check_shape(cube, dates, dates_in_data)
 
         def check_dates_in_data(dates_in_data, requested_dates):
-            requested_dates = [np.datetime64(_) for _ in requested_dates]
-            dates_in_data = [np.datetime64(_) for _ in dates_in_data]
-            assert dates_in_data == requested_dates, (
-                "Dates in data are not the requested ones:",
-                dates_in_data,
-                requested_dates,
-            )
+            _requested_dates = [np.datetime64(_) for _ in requested_dates]
+            _dates_in_data = [np.datetime64(_) for _ in dates_in_data]
+            if _dates_in_data != _requested_dates:
+                LOG.error("Dates in data are not the requested ones:")
+
+                dates_in_data = set(dates_in_data)
+                requested_dates = set(requested_dates)
+
+                missing = sorted(requested_dates - dates_in_data)
+                extra = sorted(dates_in_data - requested_dates)
+
+                if missing:
+                    LOG.error(f"Missing dates: {[_.isoformat() for _ in missing]}")
+                if extra:
+                    LOG.error(f"Extra dates: {[_.isoformat() for _ in extra]}")
+
+                raise ValueError("Dates in data are not the requested ones")
 
         check_dates_in_data(dates_in_data, dates)
 
 
@@ -18,14 +18,15 @@
 from typing import Union
 
 import numpy as np
+from anemoi.utils.config import load_config
 from anemoi.utils.dates import frequency_to_string
 from numpy.typing import NDArray
 
 LOG = logging.getLogger(__name__)
 
 
 class DatasetName:
-    """Class to validate and parse dataset names according to naming conventions."""
+    """Validate and parse dataset names according to naming conventions."""
 
     def __init__(
         self,
@@ -58,6 +59,14 @@ def __init__(
 
         self.messages = []
 
+        config = load_config().get("datasets", {})
+
+        if config.get("ignore_naming_conventions", False):
+            # setting the env variable ANEMOI_CONFIG_DATASETS_IGNORE_NAMING_CONVENTIONS=1
+            # will ignore the naming conventions
+            return
+
+        self.check_characters()
         self.check_parsed()
         self.check_resolution(resolution)
         self.check_frequency(frequency)
@@ -157,6 +166,15 @@ def check_resolution(self, resolution: Optional[str]) -> None:
         self._check_missing("resolution", resolution_str)
         self._check_mismatch("resolution", resolution_str)
 
+    def check_characters(self) -> None:
+        """Check that the name only uses lower case alphanumerics and '-'.
+
+        Any problem found is appended to ``self.messages``; nothing is raised
+        here.
+        """
+        # Compare with the lowered name: ``str.islower()`` is False for names
+        # without any cased character (e.g. digits only), which would be
+        # reported as "not lower case" by mistake.
+        if self.name != self.name.lower():
+            self.messages.append(f"the {self.name} should be in lower case.")
+        if "_" in self.name:
+            self.messages.append(f"the {self.name} should use '-' instead of '_'.")
+        # Report invalid characters once, not once per offending character.
+        if any(not c.isalnum() and c != "-" for c in self.name):
+            self.messages.append(f"the {self.name} should only contain alphanumeric characters and '-'.")
+
     def check_frequency(self, frequency: Optional[datetime.timedelta]) -> None:
         """Check if the frequency matches the expected format.