30 commits
50fcf7c
cleaning up tensorboard
anaprietonem Oct 21, 2025
60a7616
add test for w&b and better handling of dependencies
anaprietonem Oct 22, 2025
ee61e18
simplify logic
anaprietonem Oct 22, 2025
f808e00
cleaning up with Florian changes and harmonising with w&b
anaprietonem Oct 23, 2025
c5b9fcc
wip
anaprietonem Oct 23, 2025
d37726f
update
anaprietonem Oct 24, 2025
32304a5
Merge branch 'main' into deprecate_tensorboard_and_clean_loggers
anaprietonem Oct 24, 2025
af1cf4f
continue deprecation of tensorboard
anaprietonem Oct 24, 2025
51ebdf4
test for wandb logger
anaprietonem Oct 24, 2025
e75c472
Merge branch 'main' into deprecate_tensorboard_and_clean_loggers
anaprietonem Nov 20, 2025
acfb7a9
update and harmonise
anaprietonem Jan 30, 2026
3d3f69c
Merge branch 'main' into deprecate_tensorboard_and_clean_loggers
anaprietonem Jan 30, 2026
5522aea
update test
anaprietonem Jan 30, 2026
13cb9ba
update
anaprietonem Jan 30, 2026
c1b003d
Merge branch 'main' into deprecate_tensorboard_and_clean_loggers
anaprietonem Jan 30, 2026
4ba6d3a
dummy typo
anaprietonem Jan 30, 2026
0ae89c4
fixes for tests
anaprietonem Jan 30, 2026
c864dbf
fixes for tests
anaprietonem Jan 30, 2026
127a1f4
Merge branch 'main' into deprecate_tensorboard_and_clean_loggers
anaprietonem Jan 30, 2026
bed6fac
fix tests
anaprietonem Jan 30, 2026
59d5df9
undo
anaprietonem Jan 30, 2026
4012227
fix
anaprietonem Jan 30, 2026
aa86c57
pass target for w&b
anaprietonem Jan 30, 2026
b76d0b7
clean up convert to omegaconf
anaprietonem Jan 30, 2026
ec66ecc
revisit test
anaprietonem Jan 30, 2026
d5447f2
fix integration tests
anaprietonem Feb 1, 2026
10f6eae
bring back attribute
anaprietonem Feb 1, 2026
683f21e
fixes for defining run_id
anaprietonem Feb 2, 2026
dd79dba
fix typo
anaprietonem Feb 2, 2026
608a8b8
simplify
anaprietonem Feb 2, 2026
15 changes: 13 additions & 2 deletions training/pyproject.toml
@@ -69,6 +69,7 @@ optional-dependencies.dev = [
"anemoi-training[all,docs,tests]",
"pre-commit>=3.3.3",
]

optional-dependencies.docs = [
"autodoc-pydantic",
"nbsphinx",
@@ -78,6 +79,12 @@ optional-dependencies.docs = [
"sphinx-argparse",
"sphinx-rtd-theme",
]
# Optional Dependencies
optional-dependencies.mlflow = [
"anemoi-utils[mlflow]>=0.4.37",
"mlflow-skinny>=2.11.1",
]

optional-dependencies.plotting = [
"distinctipy>=1",
"pyshtools>=4.13",
@@ -89,14 +96,18 @@ optional-dependencies.profile = [
"tabulate>=0.9",
]
optional-dependencies.tests = [
"anemoi-graphs[tri]", # required to load the gnn checkpoint as part of the tests
"anemoi-training[azure]", # required to test the AzureMlflowLogger
"anemoi-graphs[tri]", # required to load the gnn checkpoint as part of the tests
"anemoi-training[azure,mlflow,wandb]",
"hypothesis>=6.11",
"pytest>=8",
"pytest-mock>=3",
"pytest-skip-slow>=0.0.5",
"sshfs",
]
optional-dependencies.wandb = [
"wandb>=0.22.2",
]

urls.Changelog = "https://github.com/ecmwf/anemoi-training/CHANGELOG.md"
urls.Documentation = "https://anemoi-training.readthedocs.io/"
urls.Homepage = "https://github.com/ecmwf/anemoi-training/"
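With these packaging changes the logger backends become optional extras: assuming the extras are published as declared above, MLflow support would be pulled in with `pip install "anemoi-training[mlflow]"` and Weights & Biases with `pip install "anemoi-training[wandb]"`, while the `tests` extra now depends on `azure`, `mlflow` and `wandb` so the new logger tests have their backends available.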
@@ -29,15 +29,15 @@ checkpoint:
log:
wandb:
enabled: False
_target_: pytorch_lightning.loggers.wandb.WandbLogger
offline: False
log_model: False
project: 'Anemoi'
entity: example
# logger options (these probably come with some overhead)
gradients: False
parameters: False
tensorboard:
enabled: False
interval: ${diagnostics.log.interval}
mlflow:
enabled: False
_target_: anemoi.training.diagnostics.mlflow.logger.AnemoiMLflowLogger
@@ -78,15 +78,15 @@ checkpoint:
log:
wandb:
enabled: False
_target_: pytorch_lightning.loggers.wandb.WandbLogger
offline: False
log_model: False
project: 'Anemoi'
entity: ???
# logger options (these probably come with some overhead)
gradients: False
parameters: False
tensorboard:
enabled: False
interval: ${diagnostics.log.interval}
mlflow:
_target_: anemoi.training.diagnostics.mlflow.logger.AnemoiMLflowLogger
enabled: False
@@ -29,15 +29,15 @@ checkpoint:
log:
wandb:
enabled: False
_target_: pytorch_lightning.loggers.wandb.WandbLogger
offline: False
log_model: False
project: 'Anemoi'
entity: example
# logger options (these probably come with some overhead)
gradients: False
parameters: False
tensorboard:
enabled: False
interval: ${diagnostics.log.interval}
mlflow:
_target_: anemoi.training.diagnostics.mlflow.logger.AnemoiMLflowLogger
enabled: False
112 changes: 53 additions & 59 deletions training/src/anemoi/training/diagnostics/logger.py
@@ -1,5 +1,5 @@
# (C) Copyright 2024 Anemoi contributors.
#

# This software is licensed under the terms of the Apache Licence Version 2.0
# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
#
@@ -15,68 +15,51 @@
from omegaconf import DictConfig
from omegaconf import OmegaConf

from anemoi.training.schemas.base_schema import BaseSchema
from anemoi.training.schemas.base_schema import convert_to_omegaconf

LOGGER = logging.getLogger(__name__)


def get_mlflow_logger(config: BaseSchema) -> None:
if not config.diagnostics.log.mlflow.enabled:
def get_mlflow_logger(
run_id: str,
fork_run_id: str,
paths: DictConfig,
logger_config: DictConfig,
**kwargs,
) -> None:
del kwargs
mlflow_config = logger_config.mlflow
if not mlflow_config.enabled:
LOGGER.debug("MLFlow logging is disabled.")
return None

logger_config = OmegaConf.to_container(convert_to_omegaconf(config).diagnostics.log.mlflow)
del logger_config["enabled"]
mlflow_config = OmegaConf.to_container(mlflow_config)
del mlflow_config["enabled"]

# backward compatibility to not break configs
logger_config["_target_"] = logger_config.get(
mlflow_config["_target_"] = mlflow_config.get(
"_target_",
"anemoi.training.diagnostics.mlflow.logger.AnemoiMLflowLogger",
)
logger_config["save_dir"] = logger_config.get("save_dir", str(config.system.output.logs.mlflow))

mlflow_config["save_dir"] = mlflow_config.get("save_dir", str(paths.logs.mlflow))
logger = instantiate(
logger_config,
run_id=config.training.run_id,
fork_run_id=config.training.fork_run_id,
mlflow_config,
run_id=run_id,
fork_run_id=fork_run_id,
)

if logger.log_terminal:
logger.log_terminal_output(artifact_save_dir=config.system.output.plots)
logger.log_terminal_output(artifact_save_dir=paths.plots)
if logger.log_system:
logger.log_system_metrics()

return logger


def get_tensorboard_logger(config: DictConfig) -> pl.loggers.TensorBoardLogger | None:
"""Setup TensorBoard experiment logger.
Parameters
----------
config : DictConfig
Job configuration
Returns
-------
pl.loggers.TensorBoardLogger | None
Logger object, or None
"""
if not config.diagnostics.log.tensorboard.enabled:
LOGGER.debug("Tensorboard logging is disabled.")
return None

from pytorch_lightning.loggers import TensorBoardLogger

return TensorBoardLogger(
save_dir=config.system.output.logs.tensorboard,
log_graph=False,
)


def get_wandb_logger(config: DictConfig, model: pl.LightningModule) -> pl.loggers.WandbLogger | None:
def get_wandb_logger(
run_id: str,
paths: DictConfig,
model: pl.LightningModule,
logger_config: DictConfig,
**kwargs,
) -> pl.loggers.WandbLogger | None:
"""Setup Weights & Biases experiment logger.
Parameters
@@ -97,33 +80,44 @@ def get_wandb_logger(config: DictConfig, model: pl.LightningModule) -> pl.logger
If `wandb` is not installed
"""
if not config.diagnostics.log.wandb.enabled:
del kwargs

save_dir = paths.logs.wandb
wandb_config = logger_config.wandb
gradients = wandb_config.gradients
parameters = wandb_config.parameters

# backward compatibility to not break configs
interval = getattr(wandb_config, "interval", 100)

if not wandb_config.enabled:
LOGGER.debug("Weights & Biases logging is disabled.")
return None

wandb_config = OmegaConf.to_container(wandb_config)
del wandb_config["gradients"]
del wandb_config["parameters"]
del wandb_config["enabled"]
del wandb_config["interval"]

try:
from pytorch_lightning.loggers.wandb import WandbLogger
logger = instantiate(
wandb_config,
id=run_id,
save_dir=save_dir,
resume=run_id is not None,
)
except ImportError as err:
msg = "To activate W&B logging, please install `wandb` as an optional dependency."
raise ImportError(msg) from err

logger = WandbLogger(
project=config.diagnostics.log.wandb.project,
entity=config.diagnostics.log.wandb.entity,
id=config.training.run_id,
save_dir=config.system.output.logs.wandb,
offline=config.diagnostics.log.wandb.offline,
log_model=config.diagnostics.log.wandb.log_model,
resume=config.training.run_id is not None,
)
logger.log_hyperparams(OmegaConf.to_container(config, resolve=True))
if config.diagnostics.log.wandb.gradients or config.diagnostics.log.wandb.parameters:
if config.diagnostics.log.wandb.gradients and config.diagnostics.log.wandb.parameters:
if gradients or parameters:
if gradients and parameters:
log_ = "all"
elif config.diagnostics.log.wandb.gradients:
elif gradients:
log_ = "gradients"
else:
log_ = "parameters"
logger.watch(model, log=log_, log_freq=config.diagnostics.log.interval, log_graph=False)
logger.watch(model, log=log_, log_freq=interval, log_graph=False)

return logger
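
The refactored helpers now take the run id, output paths and the `diagnostics.log` subtree directly, rather than the full `BaseSchema` config. Below is a minimal sketch of how a caller might wire them up, assuming the config keys shown in the YAML changes above; the run id, paths, stand-in model and config values are illustrative only, not the actual training entrypoint.

```python
# Sketch of calling the refactored logger helpers; values are illustrative.
import pytorch_lightning as pl
from omegaconf import OmegaConf

from anemoi.training.diagnostics.logger import get_mlflow_logger
from anemoi.training.diagnostics.logger import get_wandb_logger

logger_config = OmegaConf.create(
    {
        "mlflow": {"enabled": False},
        "wandb": {
            "enabled": True,
            "_target_": "pytorch_lightning.loggers.wandb.WandbLogger",
            "offline": True,
            "log_model": False,
            "project": "Anemoi",
            "entity": "example",
            "gradients": False,
            "parameters": False,
            "interval": 100,  # log_freq passed to logger.watch() when gradients/parameters are enabled
        },
    }
)
paths = OmegaConf.create(
    {
        "logs": {"mlflow": "outputs/logs/mlflow", "wandb": "outputs/logs/wandb"},
        "plots": "outputs/plots",
    }
)
model = pl.LightningModule()  # stand-in; the real LightningModule goes here

loggers = []

# MLflow is disabled in this sketch, so the helper logs a debug message and returns None.
mlflow_logger = get_mlflow_logger(
    run_id="abc123",
    fork_run_id=None,
    paths=paths,
    logger_config=logger_config,
)
if mlflow_logger is not None:
    loggers.append(mlflow_logger)

# The W&B logger is built via the `_target_` key; the remaining keys (offline,
# log_model, project, entity) are forwarded to the WandbLogger constructor.
wandb_logger = get_wandb_logger(
    run_id="abc123",
    paths=paths,
    model=model,
    logger_config=logger_config,
)
if wandb_logger is not None:
    loggers.append(wandb_logger)
```

The unused `**kwargs` (deleted immediately in both helpers) presumably lets a single call site pass the same keyword set to either backend without filtering it first.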
22 changes: 15 additions & 7 deletions training/src/anemoi/training/diagnostics/mlflow/logger.py
@@ -25,6 +25,7 @@
from weakref import WeakValueDictionary

import mlflow
from mlflow.exceptions import RestException
from mlflow.tracking import MlflowClient
from omegaconf import DictConfig
from pytorch_lightning.callbacks import Checkpoint
@@ -321,15 +322,14 @@ def __init__(
self._flag_log_hparams = log_hyperparams
if self._resumed and not on_resume_create_child:
LOGGER.info(
(
"Resuming run without creating child run - MLFlow logs will not update the"
"initial runs hyperparameters with those of the resumed run."
"To update the initial run's hyperparameters, set "
"`diagnostics.log.mlflow.on_resume_create_child: True`."
),
"Resuming run without creating child run - MLFlow logs will not update the"
"initial runs hyperparameters with those of the resumed run."
"To update the initial run's hyperparameters, set "
"`diagnostics.log.mlflow.on_resume_create_child: True`.",
)
self._flag_log_hparams = False

# initialize server2server lineage attributes
self._fork_run_server2server = None
self._parent_run_server2server = None
self._parent_dry_run = False
@@ -516,7 +516,15 @@ def log_metrics(self, metrics: Mapping[str, float], step: int | None = None) ->
cleaned_metrics.pop(k)
continue
self._logged_metrics.add(metric_id)
return super().log_metrics(metrics=cleaned_metrics, step=step)
try:
return super().log_metrics(metrics=cleaned_metrics, step=step)
except RestException as e:
# Handle duplicate metric key issue gracefully
if "duplicate key value violates unique constraint" in str(e):
LOGGER.warning("Duplicate metric detected %s", e)
else:
# Re-raise if it's a different kind of error
raise

@rank_zero_only
def log_system_metrics(self) -> None:
3 changes: 0 additions & 3 deletions training/src/anemoi/training/diagnostics/profilers.py
@@ -483,9 +483,6 @@ def get_system_profiler_df(self, logger_name: str, logger: pl.loggers.Logger) ->
system_metrics_df = self.to_df(WandBSystemSummarizer(logger).summarize_system_metrics())
elif logger_name == "mlflow":
system_metrics_df = MLFlowSystemSummarizer(logger).summarize_mlflow_system_metrics()
elif logger_name == "tensorboard":
LOGGER.info("No system profiler data available for Tensorboard")
system_metrics_df = None

self.system_report_fname = self.dirpath / "system_profiler.csv"
self._save_report(system_metrics_df, self.system_report_fname)
4 changes: 0 additions & 4 deletions training/src/anemoi/training/schemas/base_schema.py
@@ -63,10 +63,6 @@ def expand_paths(config_system: Union[SystemSchema, DictConfig]) -> Union[System
output_config.logs.mlflow = (
base / "mlflow" if output_config.logs.mlflow is None else base / output_config.logs.mlflow
)
output_config.logs.tensorboard = (
base / "tensorboard" if output_config.logs.tensorboard is None else base / output_config.logs.tensorboard
)

# CheckPointSchema
output_config.checkpoints.root = (
root_output_path / output_config.checkpoints.root if output_config.checkpoints.root else root_output_path