Skip to content

Commit 44212c0

Browse files
authored
Merge branch 'main' into feat/verify-dataset
2 parents 952226d + 8213d35 commit 44212c0

File tree

23 files changed

+723
-94
lines changed

23 files changed

+723
-94
lines changed

.github/CODEOWNERS

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
1-
# CODEOWNERS file
21

3-
# Protect workflow files
4-
/.github/ @theissenhelen @jesperdramsch @gmertes @b8raoult @floriankrb @anaprietonem @HCookie @JPXKQX @mchantry
5-
/.pre-commit-config.yaml @theissenhelen @jesperdramsch @gmertes @b8raoult @floriankrb @anaprietonem @HCookie @JPXKQX @mchantry
6-
/pyproject.toml @theissenhelen @jesperdramsch @gmertes @b8raoult @floriankrb @anaprietonem @HCookie @JPXKQX @mchantry
2+
# Workflows
3+
/.github/ @ecmwf/AnemoiSecurity
4+
5+
# Project configs
6+
/pyproject.toml @ecmwf/AnemoiSecurity
7+
/.pre-commit-config.yaml @ecmwf/AnemoiSecurity
8+
/.release-please-config.json @ecmwf/AnemoiSecurity

.pre-commit-config.yaml

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -40,14 +40,13 @@ repos:
4040
- --force-single-line-imports
4141
- --profile black
4242
- repo: https://github.com/astral-sh/ruff-pre-commit
43-
rev: v0.9.9
43+
rev: v0.11.4
4444
hooks:
4545
- id: ruff
4646
args:
4747
- --line-length=120
4848
- --fix
4949
- --exit-non-zero-on-fix
50-
- --preview
5150
- --exclude=docs/**/*_.py
5251
- repo: https://github.com/sphinx-contrib/sphinx-lint
5352
rev: v1.0.0
@@ -69,7 +68,7 @@ repos:
6968
hooks:
7069
- id: pyproject-fmt
7170
- repo: https://github.com/jshwi/docsig # Check docstrings against function sig
72-
rev: v0.69.1
71+
rev: v0.69.3
7372
hooks:
7473
- id: docsig
7574
args:
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
.. _complement-step:
2+
3+
##############################################
4+
Combining cutout with complementing datasets
5+
##############################################
6+
7+
Here we explain how to combine a cutout with a complementing dataset.
8+
9+
****************************
10+
Interpolate to cutout grid
11+
****************************
12+
13+
In this case, we will use a ``lam-dataset`` on a different grid that
14+
contains just one variable (``tp`` in the example below) and a
15+
``global-dataset``. What we want to do is to interpolate the
16+
``global-dataset`` to the resulting dataset from the cutout grid
17+
operation.
18+
19+
.. literalinclude:: code/cutout-complement1.py
20+
21+
or for the config file:
22+
23+
.. literalinclude:: yaml/cutout-complement1.yaml
24+
25+
The ``adjust`` option is used when the start or end dates do not exactly
26+
match.
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
from anemoi.datasets import open_dataset
2+
3+
ds = open_dataset(
4+
complement={
5+
"cutout": [
6+
"lam-dataset",
7+
{
8+
"dataset": "global-dataset",
9+
"select": ["tp"],
10+
},
11+
],
12+
"min_distance_km": 1,
13+
"adjust": "dates",
14+
},
15+
source="global-dataset",
16+
interpolation="nearest",
17+
)
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
dataset:
2+
complement:
3+
dataset:
4+
cutout:
5+
- lam-dataset
6+
- dataset: global-dataset
7+
select: [ tp ]
8+
min_distance_km: 1
9+
adjust: dates
10+
source: global-dataset
11+
interpolation: nearest

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ dynamic = [
5050
]
5151
dependencies = [
5252
"anemoi-transform>=0.1.9",
53-
"anemoi-utils[provenance]>=0.4.19",
53+
"anemoi-utils[provenance]>=0.4.21",
5454
"cfunits",
5555
"numcodecs<0.16", # Until we move to zarr3
5656
"numpy",
@@ -109,6 +109,7 @@ optional-dependencies.xarray = [
109109
"pandas",
110110
"planetary-computer",
111111
"pystac-client",
112+
"s3fs>=0.5",
112113
]
113114

114115
urls.Changelog = "https://github.com/ecmwf/anemoi-datasets/CHANGELOG.md"

src/anemoi/datasets/commands/check.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
# (C) Copyright 2024 Anemoi contributors.
2+
#
3+
# This software is licensed under the terms of the Apache Licence Version 2.0
4+
# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
5+
#
6+
# In applying this licence, ECMWF does not waive the privileges and immunities
7+
# granted to it by virtue of its status as an intergovernmental organisation
8+
# nor does it submit to any jurisdiction.
9+
10+
import logging
11+
import os
12+
from typing import Any
13+
14+
import yaml
15+
16+
from anemoi.datasets.create.check import DatasetName
17+
18+
from . import Command
19+
20+
LOG = logging.getLogger(__name__)
21+
22+
23+
class Check(Command):
    """Check that a dataset name follows the naming conventions.

    The name to validate is given directly with ``--name`` and/or read from
    the ``name`` entry of the recipe file given with ``--recipe``.
    """

    timestamp = True

    def add_arguments(self, command_parser: Any) -> None:
        """Add command line arguments to the parser.

        Parameters
        ----------
        command_parser : Any
            The command line argument parser.
        """
        command_parser.add_argument(
            "--recipe",
            help="Path to a YAML recipe; its 'name' entry is checked against the naming conventions.",
        )
        command_parser.add_argument(
            "--name",
            help="Dataset name to check against the naming conventions.",
        )

    def run(self, args: Any) -> None:
        """Validate the names selected on the command line.

        Parameters
        ----------
        args : Any
            Parsed command line arguments; ``recipe`` and ``name`` are read.

        Notes
        -----
        Validation errors are whatever ``DatasetName.raise_if_not_valid``
        raises; a recipe without a ``name`` entry raises ``KeyError``.
        """
        if args.recipe:
            recipe_filename = os.path.basename(args.recipe)
            recipe_name = os.path.splitext(recipe_filename)[0]

            # Read the recipe inside a context manager so the file handle is
            # closed even when parsing fails.
            with open(args.recipe, "r", encoding="utf-8") as f:
                in_recipe_name = yaml.safe_load(f)["name"]

            # A mismatch between file name and recipe name is only reported,
            # it does not stop the validation of the name itself.
            if recipe_name != in_recipe_name:
                print(f"Recipe name {recipe_name} does not match the name in the recipe file {in_recipe_name}")

            DatasetName(name=in_recipe_name).raise_if_not_valid()

        if args.name:
            DatasetName(name=args.name).raise_if_not_valid()


command = Check

src/anemoi/datasets/commands/copy.py

Lines changed: 36 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -319,10 +319,30 @@ def copy_group(self, source: Any, target: Any, _copy: Any, verbosity: int) -> No
319319
"""
320320
import zarr
321321

322+
if self.verbosity > 0:
323+
LOG.info(f"Copying group {source} to {target}")
324+
322325
for k, v in source.attrs.items():
326+
if self.verbosity > 1:
327+
import textwrap
328+
329+
LOG.info(f"Copying attribute {k} = {textwrap.shorten(str(v), 40)}")
323330
target.attrs[k] = v
324331

325-
for name in sorted(source.keys()):
332+
source_keys = list(source.keys())
333+
334+
if not source_keys:
335+
raise ValueError(f"Source group {source} is empty.")
336+
337+
if self.verbosity > 1:
338+
LOG.info(f"Keys {source_keys}")
339+
340+
for name in sorted(source_keys):
341+
if name.startswith("."):
342+
if self.verbosity > 1:
343+
LOG.info(f"Skipping {name}")
344+
continue
345+
326346
if isinstance(source[name], zarr.hierarchy.Group):
327347
group = target[name] if name in target else target.create_group(name)
328348
self.copy_group(
@@ -362,6 +382,11 @@ def copy(self, source: Any, target: Any, verbosity: int) -> None:
362382
_copy = target["_copy"]
363383
_copy_np = _copy[:]
364384

385+
if self.verbosity > 1:
386+
import numpy as np
387+
388+
LOG.info(f"copy {np.sum(_copy_np)} of {len(_copy_np)}")
389+
365390
self.copy_group(source, target, _copy_np, verbosity)
366391
del target["_copy"]
367392

@@ -417,11 +442,19 @@ def open_target() -> Any:
417442
LOG.error("Target already exists, use either --overwrite or --resume.")
418443
sys.exit(1)
419444

445+
if self.verbosity > 0:
446+
LOG.info(f"Open target: {self.target}")
447+
420448
target = open_target()
421449

422450
assert target is not None, target
423451

452+
if self.verbosity > 0:
453+
LOG.info(f"Open source: {self.source}")
454+
424455
source = zarr.open(self._store(self.source), mode="r")
456+
# zarr.consolidate_metadata(source)
457+
425458
self.copy(source, target, self.verbosity)
426459

427460

@@ -488,8 +521,8 @@ def run(self, args: Any) -> None:
488521
if args.source.startswith("s3://") and not args.source.endswith("/"):
489522
args.source = args.source + "/"
490523
copier = Transfer(
491-
args.source,
492-
args.target,
524+
source=args.source,
525+
target=args.target,
493526
overwrite=args.overwrite,
494527
resume=args.resume,
495528
verbosity=args.verbosity,

src/anemoi/datasets/commands/create.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -180,10 +180,9 @@ def parallel_create(self, args: Any) -> None:
180180
executor.submit(task, "init-additions", options).result()
181181

182182
with ExecutorClass(max_workers=parallel) as executor:
183-
opt = options.copy()
184-
opt["parts"] = f"{n+1}/{total}"
185-
futures.append(executor.submit(task, "load", opt))
186183
for n in range(total):
184+
opt = options.copy()
185+
opt["parts"] = f"{n+1}/{total}"
187186
futures.append(executor.submit(task, "load-additions", opt))
188187

189188
for future in tqdm.tqdm(

src/anemoi/datasets/commands/scan.py

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,16 @@
2323

2424

2525
class Scan(Command):
26+
"""Command to scan files and generate a configuration file.
27+
28+
Attributes
29+
----------
30+
internal : bool
31+
Indicates whether the command is internal.
32+
timestamp : bool
33+
Indicates whether to include a timestamp.
34+
"""
35+
2636
internal = True
2737
timestamp = True
2838

@@ -32,8 +42,9 @@ def add_arguments(self, command_parser: Any) -> None:
3242
Parameters
3343
----------
3444
command_parser : Any
35-
The command parser to which arguments are added.
45+
The command-line argument parser.
3646
"""
47+
3748
command_parser.add_argument(
3849
"--match",
3950
help="Give a glob pattern to match files (default: *.grib)",
@@ -51,22 +62,23 @@ def run(self, args: Any) -> None:
5162
Parameters
5263
----------
5364
args : Any
54-
The arguments passed to the command.
65+
Parsed command-line arguments.
5566
"""
5667

5768
def match(path: str) -> bool:
58-
"""Check if a path matches the given pattern.
69+
"""Check if a file path matches the given glob pattern.
5970
6071
Parameters
6172
----------
6273
path : str
63-
The path to check.
74+
The file path to check.
6475
6576
Returns
6677
-------
6778
bool
68-
True if the path matches, False otherwise.
79+
True if the path matches the pattern, False otherwise.
6980
"""
81+
7082
return fnmatch.fnmatch(path, args.match)
7183

7284
paths = []

src/anemoi/datasets/create/__init__.py

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -938,13 +938,23 @@ def check_shape(cube, dates, dates_in_data):
938938
check_shape(cube, dates, dates_in_data)
939939

940940
def check_dates_in_data(dates_in_data, requested_dates):
    """Ensure the dates found in the data are exactly the requested ones.

    Parameters
    ----------
    dates_in_data : iterable
        Dates found in the data; each item must be accepted by ``np.datetime64``.
    requested_dates : iterable
        Dates that were requested, in the expected order.

    Raises
    ------
    ValueError
        If the two sequences are not equal element by element.
    """
    found = [np.datetime64(_) for _ in dates_in_data]
    wanted = [np.datetime64(_) for _ in requested_dates]
    if found == wanted:
        return

    LOG.error("Dates in data are not the requested ones:")

    # Build the diagnostics from the normalised values, brought to a common
    # unit, rather than from the raw inputs: the raw inputs may mix
    # ``datetime`` and ``np.datetime64`` objects, which do not compare or hash
    # consistently in a set and do not all provide ``isoformat()``.
    found_set = {_.astype("datetime64[s]") for _ in found}
    wanted_set = {_.astype("datetime64[s]") for _ in wanted}

    missing = sorted(wanted_set - found_set)
    extra = sorted(found_set - wanted_set)

    if missing:
        LOG.error(f"Missing dates: {[str(_) for _ in missing]}")
    if extra:
        LOG.error(f"Extra dates: {[str(_) for _ in extra]}")
    if not (missing or extra):
        # Without this, a mismatch caused only by ordering or duplicates
        # would be reported with no detail at all.
        LOG.error("Same dates at one-second resolution: they differ in order, repetition or sub-second precision")

    raise ValueError("Dates in data are not the requested ones")
948958

949959
check_dates_in_data(dates_in_data, dates)
950960

src/anemoi/datasets/create/check.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,14 +18,15 @@
1818
from typing import Union
1919

2020
import numpy as np
21+
from anemoi.utils.config import load_config
2122
from anemoi.utils.dates import frequency_to_string
2223
from numpy.typing import NDArray
2324

2425
LOG = logging.getLogger(__name__)
2526

2627

2728
class DatasetName:
28-
"""Class to validate and parse dataset names according to naming conventions."""
29+
"""Validate and parse dataset names according to naming conventions."""
2930

3031
def __init__(
3132
self,
@@ -58,6 +59,14 @@ def __init__(
5859

5960
self.messages = []
6061

62+
config = load_config().get("datasets", {})
63+
64+
if config.get("ignore_naming_conventions", False):
65+
# setting the env variable ANEMOI_CONFIG_DATASETS_IGNORE_NAMING_CONVENTIONS=1
66+
# will ignore the naming conventions
67+
return
68+
69+
self.check_characters()
6170
self.check_parsed()
6271
self.check_resolution(resolution)
6372
self.check_frequency(frequency)
@@ -157,6 +166,15 @@ def check_resolution(self, resolution: Optional[str]) -> None:
157166
self._check_missing("resolution", resolution_str)
158167
self._check_mismatch("resolution", resolution_str)
159168

169+
def check_characters(self) -> None:
    """Check that the name only uses lower-case alphanumerics and '-'.

    Each problem found is appended once to ``self.messages``; nothing is
    raised here.
    """
    # Compare with the lower-cased form: ``str.islower()`` is False for a
    # name without any cased character (e.g. digits only), which would be
    # reported as not being lower case.
    if self.name != self.name.lower():
        self.messages.append(f"the {self.name} should be in lower case.")
    if "_" in self.name:
        self.messages.append(f"the {self.name} should use '-' instead of '_'.")
    # Report the problem once, not once per offending character.
    if any(not c.isalnum() and c != "-" for c in self.name):
        self.messages.append(f"the {self.name} should only contain alphanumeric characters and '-'.")
177+
160178
def check_frequency(self, frequency: Optional[datetime.timedelta]) -> None:
161179
"""Check if the frequency matches the expected format.
162180

0 commit comments

Comments
 (0)