
Commit f925dba

Simplify and speed up open_mfdataset
Xarray's open_mfdataset is very slow for BOUT++ datasets. Other datasets also have issues (see e.g. pydata/xarray#1385), though the cause may not be the same. Opening the datasets individually and then concatenating them significantly speeds up this process.
1 parent 2d11a6f commit f925dba

File tree: 2 files changed, +97 -77 lines changed

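To make the change concrete, below is a minimal sketch of the pattern the commit message describes: open each file separately, then concatenate once, rather than relying on a single xr.open_mfdataset call. This is not xbout code (the file names are invented for illustration); the actual implementation added by this commit is in xbout/mfdataset.py further down.

import xarray as xr

# Hypothetical dump-file names, for illustration only
paths = ["BOUT.dmp.0.nc", "BOUT.dmp.1.nc", "BOUT.dmp.2.nc"]

# The slow route this commit moves away from:
# ds = xr.open_mfdataset(paths, combine="nested", concat_dim="x")

# The faster pattern: open each file, then concatenate in one call
datasets = [xr.open_dataset(p, chunks={}) for p in paths]
ds = xr.concat(
    datasets,
    "x",
    data_vars="minimal",  # only concatenate variables that contain "x"
    coords="minimal",
    compat="override",    # take everything else from the first dataset
    join="exact",
)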

xbout/load.py

Lines changed: 13 additions & 77 deletions
@@ -19,22 +19,6 @@
     _is_dir,
 )
 
-
-_BOUT_PER_PROC_VARIABLES = [
-    "wall_time",
-    "wtime",
-    "wtime_rhs",
-    "wtime_invert",
-    "wtime_comms",
-    "wtime_io",
-    "wtime_per_rhs",
-    "wtime_per_rhs_e",
-    "wtime_per_rhs_i",
-    "PE_XIND",
-    "PE_YIND",
-    "MYPE",
-]
-_BOUT_TIME_DEPENDENT_META_VARS = ["iteration", "hist_hi", "tt"]
 _BOUT_GEOMETRY_VARS = [
     "ixseps1",
     "ixseps2",
@@ -69,9 +53,6 @@
 )
 
 
-# TODO somehow check that we have access to the latest version of auto_combine
-
-
 def open_boutdataset(
     datapath="./BOUT.dmp.*.nc",
     inputfilepath=None,
@@ -295,15 +276,6 @@ def attrs_remove_section(obj, section):
     else:
         raise ValueError(f"internal error: unexpected input_type={input_type}")
 
-    if not is_restart:
-        for var in _BOUT_TIME_DEPENDENT_META_VARS:
-            if var in ds:
-                # Assume different processors in x & y have same iteration etc.
-                latest_top_left = {dim: 0 for dim in ds[var].dims}
-                if "t" in ds[var].dims:
-                    latest_top_left["t"] = -1
-                ds[var] = ds[var].isel(latest_top_left).squeeze(drop=True)
-
     ds, metadata = _separate_metadata(ds)
     # Store as ints because netCDF doesn't support bools, so we can't save
     # bool attributes
@@ -616,11 +588,6 @@ def _auto_open_mfboutdataset(
     if chunks is None:
         chunks = {}
 
-    if is_restart:
-        data_vars = "minimal"
-    else:
-        data_vars = _BOUT_TIME_DEPENDENT_META_VARS
-
     if _is_path(datapath):
         filepaths, filetype = _expand_filepaths(datapath)
 
@@ -640,6 +607,9 @@
         else:
             remove_yboundaries = False
 
+        # Create a partial application of _trim
+        # Calls to _preprocess will call _trim to trim guard / boundary cells
+        # from datasets before merging.
         _preprocess = partial(
             _trim,
             guards={"x": mxg, "y": myg},
@@ -651,40 +621,11 @@
 
         paths_grid, concat_dims = _arrange_for_concatenation(filepaths, nxpe, nype)
 
-        try:
-            ds = xr.open_mfdataset(
-                paths_grid,
-                concat_dim=concat_dims,
-                combine="nested",
-                data_vars=data_vars,
-                preprocess=_preprocess,
-                engine=filetype,
-                chunks=chunks,
-                join="exact",
-                **kwargs,
-            )
-        except ValueError as e:
-            message_to_catch = (
-                "some variables in data_vars are not data variables on the first "
-                "dataset:"
-            )
-            if str(e)[: len(message_to_catch)] == message_to_catch:
-                # Open concatenating any variables that are different in
-                # different files as a work around to support opening older
-                # data.
-                ds = xr.open_mfdataset(
-                    paths_grid,
-                    concat_dim=concat_dims,
-                    combine="nested",
-                    data_vars="different",
-                    preprocess=_preprocess,
-                    engine=filetype,
-                    chunks=chunks,
-                    join="exact",
-                    **kwargs,
-                )
-            else:
-                raise
+        # Call custom implementation of open_mfdataset
+        # avoiding some of the performance issues.
+        from .mfdataset import mfdataset
+
+        ds = mfdataset(paths_grid, concat_dim=concat_dims, preprocess=_preprocess)
     else:
         # datapath was nested list of Datasets
 
@@ -731,11 +672,6 @@ def _auto_open_mfboutdataset(
             combine_attrs="no_conflicts",
         )
 
-    if not is_restart:
-        # Remove any duplicate time values from concatenation
-        _, unique_indices = unique(ds["t_array"], return_index=True)
-        ds = ds.isel(t=unique_indices)
-
     return ds, remove_yboundaries
 
 
@@ -933,8 +869,10 @@ def _trim(ds, *, guards, keep_boundaries, nxpe, nype, is_restart):
     """
     Trims all guard (and optionally boundary) cells off a single dataset read from a
     single BOUT dump file, to prepare for concatenation.
-    Also drops some variables that store timing information, which are different for each
-    process and so cannot be concatenated.
+
+    Variables that store timing information, which are different for each
+    process, are not trimmed but are taken from the first processor during
+    concatenation.
 
     Parameters
     ----------
@@ -973,9 +911,7 @@ def _trim(ds, *, guards, keep_boundaries, nxpe, nype, is_restart):
     ):
         trimmed_ds = trimmed_ds.drop_vars(name)
 
-    to_drop = _BOUT_PER_PROC_VARIABLES
-
-    return trimmed_ds.drop_vars(to_drop, errors="ignore")
+    return trimmed_ds
 
 
 def _infer_contains_boundaries(ds, nxpe, nype):
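The docstring change above reflects the key design choice of this commit: instead of dropping per-processor timing variables in _trim, the new concatenation options take them from the first dataset. A small self-contained sketch (synthetic data, not xbout code) of how xr.concat behaves with the options used here:

import numpy as np
import xarray as xr

# Two per-processor datasets: "n" spans the concatenation dimension "x",
# while "wall_time" differs between processors and has no "x" dimension.
ds0 = xr.Dataset({"n": ("x", np.arange(3)), "wall_time": 1.0})
ds1 = xr.Dataset({"n": ("x", np.arange(3, 6)), "wall_time": 2.0})

combined = xr.concat(
    [ds0, ds1],
    "x",
    data_vars="minimal",  # only variables containing "x" are concatenated
    coords="minimal",
    compat="override",    # variables without "x" are taken from the first dataset
    join="exact",
)

print(combined["n"].values)          # [0 1 2 3 4 5]
print(float(combined["wall_time"]))  # 1.0, taken from ds0 rather than concatenated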

xbout/mfdataset.py

Lines changed: 84 additions & 0 deletions
@@ -0,0 +1,84 @@
+# Custom implementation of xarray.open_mfdataset()
+
+import xarray as xr
+
+
+def concat_outer(dss, operation=None):
+    """
+    Concatenate nested lists along their outer dimension
+
+    # Example
+
+    >>> m = [[1,2,3],
+             [2,3,3],
+             [5,4,3]]
+
+    >>> concat_outer(m)
+
+    [[1, 2, 5], [2, 3, 4], [3, 3, 3]]
+
+    >>> concat_outer(m, operation=sum)
+
+    [8, 9, 9]
+
+    """
+    if not isinstance(dss[0], list):
+        # Input is a 1D list
+        if operation is not None:
+            return operation(dss)
+        return dss
+
+    # Two or more dimensions
+    # Swap first and second indices then concatenate inner
+    if len(dss[0]) == 1:
+        return concat_outer([dss[j][0] for j in range(len(dss))], operation=operation)
+
+    return [
+        concat_outer([dss[j][i] for j in range(len(dss))], operation=operation)
+        for i in range(len(dss[0]))
+    ]
+
+
+def mfdataset(paths, chunks=None, concat_dim=None, preprocess=None):
+    if chunks is None:
+        chunks = {}
+
+    if not isinstance(concat_dim, list):
+        concat_dim = [concat_dim]
+
+    if isinstance(paths, list):
+        # Read nested dataset
+
+        dss = [
+            mfdataset(
+                path, chunks=chunks, concat_dim=concat_dim[1:], preprocess=preprocess
+            )
+            for path in paths
+        ]
+
+        # The dimension to concatenate along
+        if concat_dim[0] is None:
+            # Not concatenating
+            if len(dss) == 1:
+                return dss[0]
+            return dss
+
+        # Concatenating along the top-level dimension
+        return concat_outer(
+            dss,
+            operation=lambda ds: xr.concat(
+                ds,
+                concat_dim[0],
+                data_vars="minimal",  # Only data variables in which the dimension already appears are concatenated
+                coords="minimal",  # Only coordinates in which the dimension already appears are concatenated
+                compat="override",  # Duplicate data taken from first dataset
+                join="exact",  # Don't align. Raise ValueError when indexes to be aligned are not equal
+                combine_attrs="override",  # Duplicate attributes taken from first dataset
+                create_index_for_new_dim=False,
+            ),
+        )
+    # A single path
+    ds = xr.open_dataset(paths, chunks=chunks)
+    if preprocess is not None:
+        ds = preprocess(ds)
+    return ds
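A hypothetical usage sketch of the new helper (file names and dimension names are invented; in xbout the nested path list and the concatenation dimensions come from _arrange_for_concatenation). The outer list level is concatenated along the first entry of concat_dim, the inner level along the second.

from xbout.mfdataset import mfdataset

# Invented 2x2 grid of dump files: outer list varies along "y", inner along "x"
paths_grid = [
    ["BOUT.dmp.0.nc", "BOUT.dmp.1.nc"],
    ["BOUT.dmp.2.nc", "BOUT.dmp.3.nc"],
]

# Each file is opened with xr.open_dataset and optionally preprocessed, then the
# rows are concatenated along "x" and the row results along "y".
ds = mfdataset(paths_grid, concat_dim=["y", "x"], preprocess=None)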
