MeteoSwiss
diff --git a/‎config/streams/streams_anemoi/era5.yml
Lines changed: 2 additions & 2 deletions b/‎config/streams/streams_anemoi/era5.yml
Lines changed: 2 additions & 2 deletions
diff --git a/‎config/streams/streams_ocean/fesom.yml
Lines changed: 35 additions & 0 deletions b/‎config/streams/streams_ocean/fesom.yml
Lines changed: 35 additions & 0 deletions
diff --git a/‎src/weathergen/datasets/anemoi_dataset.py
Lines changed: 157 additions & 23 deletions b/‎src/weathergen/datasets/anemoi_dataset.py
Lines changed: 157 additions & 23 deletions
diff --git a/‎src/weathergen/datasets/atmorep_dataset.py
Lines changed: 89 additions & 0 deletions b/‎src/weathergen/datasets/atmorep_dataset.py
Lines changed: 89 additions & 0 deletions
@@ -10,9 +10,9 @@
 ERA5 :
   type : anemoi
   filenames : ['aifs-ea-an-oper-0001-mars-o96-1979-2022-6h-v6.zarr']
+  # source : ['u_', 'v_', '10u', '10v']
+  # target : ['10u', '10v']
   loss_weight : 1.
-  source_variables : [null]
-  target_variables : [null]
   diagnostic : False
   masking_rate : 0.6
   masking_rate_none : 0.05
 
@@ -0,0 +1,35 @@
+# (C) Copyright 2024 WeatherGenerator contributors.
+#
+# This software is licensed under the terms of the Apache Licence Version 2.0
+# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+#
+# In applying this licence, ECMWF does not waive the privileges and immunities
+# granted to it by virtue of its status as an intergovernmental organisation
+# nor does it submit to any jurisdiction.
+
+FESOM :
+  type : fesom
+  filenames : ['coupled_yearly']
+  loss_weight : 1.
+  source : null
+  target : ['sst']
+  masking_rate : 0.6
+  masking_rate_none : 0.05
+  token_size : 64
+  embed : 
+    net : transformer
+    num_tokens : 1
+    num_heads : 2
+    dim_embed : 256
+    num_blocks : 2
+  embed_target_coords :
+    net : linear
+    dim_embed : 256
+  target_readout :
+    type : 'obs_value'  # token or obs_value
+    num_layers : 2
+    num_heads : 4
+    # sampling_rate : 0.2
+  pred_head :
+    ens_size : 1
+    num_layers : 1
@@ -18,14 +18,15 @@ class AnemoiDataset:
 
     def __init__(
         self,
-        filename: str,
         start: int,
         end: int,
         len_hrs: int,
-        step_hrs: int = None,
-        normalize: bool = True,
-        select: list[str] = None,
+        step_hrs: int,
+        filename: str,
+        stream_info: dict,
     ) -> None:
+        # TODO: add support for different normalization modes
+
         assert len_hrs == step_hrs, "Currently only step_hrs=len_hrs is supported"
 
         # open dataset to peek at it and check that it is compatible with requested parameters
@@ -40,30 +41,61 @@ def __init__(
         dt_start = datetime.datetime.strptime(str(start), format_str)
         dt_end = datetime.datetime.strptime(str(end), format_str)
 
+        # TODO, TODO, TODO: we need proper alignment for the case where len_hrs is not
+        # a multiple of self.ds.frequency (the division below is truncated by int())
+        self.num_steps_per_window = int((len_hrs * 3600) / self.ds.frequency.seconds)
+
         # open dataset
 
         # caches lats and lons
         self.latitudes = self.ds.latitudes.astype(np.float32)
         self.longitudes = self.ds.longitudes.astype(np.float32)
 
-        # find physical fields (i.e. filter out auxiliary information to facilitate prediction)
-        self.fields_idx = np.sort(
+        # TODO: define in base class
+        self.geoinfo_idx = []
+
+        # Determine source and target channels, filtering out forcings etc and using
+        # specified source and target channels if specified
+        source_channels = stream_info["source"] if "source" in stream_info else None
+        self.source_idx = np.sort(
             [
                 self.ds.name_to_index[k]
                 for i, (k, v) in enumerate(self.ds.typed_variables.items())
-                if not v.is_computed_forcing and not v.is_constant_in_time
+                if (
+                    not v.is_computed_forcing
+                    and not v.is_constant_in_time
+                    and (
+                        np.array([f in k for f in source_channels]).any()
+                        if source_channels
+                        else True
+                    )
+                )
             ]
         )
-        # TODO: use complement of self.fields_idx as geoinfo
-        self.fields = [self.ds.variables[i] for i in self.fields_idx]
-        self.colnames = ["lat", "lon"] + self.fields
-        self.selected_colnames = self.colnames
+        target_channels = stream_info["target"] if "target" in stream_info else None
+        self.target_idx = np.sort(
+            [
+                self.ds.name_to_index[k]
+                for i, (k, v) in enumerate(self.ds.typed_variables.items())
+                if (
+                    not v.is_computed_forcing
+                    and not v.is_constant_in_time
+                    and (
+                        np.array([f in k for f in target_channels]).any()
+                        if target_channels
+                        else True
+                    )
+                )
+            ]
+        )
+        self.source_channels = [self.ds.variables[i] for i in self.source_idx]
+        self.target_channels = [self.ds.variables[i] for i in self.target_idx]
 
         self.properties = {
-            "obs_id": 0,
-            "means": self.ds.statistics["mean"],
-            "vars": np.square(self.ds.statistics["stdev"]),
+            "stream_id": 0,
         }
+        self.mean = self.ds.statistics["mean"]
+        self.stdev = self.ds.statistics["stdev"]
 
         # set dataset to None when no overlap with time range
         if dt_start >= ds_dt_end or dt_end <= ds_dt_start:
@@ -80,26 +112,128 @@ def __len__(self):
 
         return len(self.ds)
 
-    def __getitem__(self, idx: int) -> tuple:
-        "Get (data,datetime) for given index"
+    def get_source(self, idx: int) -> tuple[np.array, np.array, np.array, np.array]:
+        """
+        Return the sample starting at index idx, restricted to the source channels.
+
+        Delegates to self._get with self.source_idx and returns its result unchanged,
+        i.e. a tuple (latlon, geoinfos, data, datetimes).
+        """
+        return self._get(idx, self.source_idx)
+
+    def get_target(self, idx: int) -> tuple[np.array, np.array, np.array, np.array]:
+        """
+        Return the sample starting at index idx, restricted to the target channels.
+
+        Delegates to self._get with self.target_idx and returns its result unchanged,
+        i.e. a tuple (latlon, geoinfos, data, datetimes).
+        """
+        return self._get(idx, self.target_idx)
+
+    def _get(
+        self, idx: int, channels_idx: np.array
+    ) -> tuple[np.array, np.array, np.array, np.array]:
+        """
+        Assemble one sample window starting at index idx for the given channels.
+
+        Reads self.num_steps_per_window consecutive entries of self.ds starting at idx,
+        keeps only the channels listed in channels_idx and flattens the result to one
+        row per (step, point) pair with one column per channel.
+
+        Returns a tuple (latlon, geoinfos, data, datetimes). All four entries are empty
+        float32 arrays when self.ds is falsy.
+        """
 
         if not self.ds:
-            return (np.array([], dtype=np.float32), np.array([], dtype=np.float32))
+            return (
+                np.array([], dtype=np.float32),
+                np.array([], dtype=np.float32),
+                np.array([], dtype=np.float32),
+                np.array([], dtype=np.float32),
+            )
+
+        # read the steps of this window; index 0 on the third axis drops the ensemble
+        # dimension, leaving a 3-d array (presumably step x variable x point — confirm)
+        data = self.ds[idx : idx + self.num_steps_per_window][:, :, 0]
+        # select the channels and move them to the last axis, then merge the first two
+        # axes: rows are ordered step by step (all points of step 0, then step 1, ...)
+        data = (
+            data[:, channels_idx].transpose([0, 2, 1]).reshape((data.shape[0] * data.shape[2], -1))
+        )
 
-        # prepend lat and lon to data; squeeze out ensemble dimension (for the moment)
-        data = np.concatenate(
+        # construct lat/lon coords: one row per point with columns (lat, lon)
+        latlon = np.concatenate(
             [
                 np.expand_dims(self.latitudes, 0),
                 np.expand_dims(self.longitudes, 0),
-                self.ds[idx].squeeze(),
             ],
             0,
         ).transpose()
+        # NOTE(review): np.repeat(axis=0) emits every point num_steps_per_window times in
+        # a row, whereas the rows of data are ordered step by step. The two orderings
+        # only agree for num_steps_per_window == 1; np.tile may be intended — confirm.
+        # The trailing reshape keeps the shape produced by np.repeat.
+        latlon = np.repeat(latlon, self.num_steps_per_window, axis=0).reshape((-1, latlon.shape[1]))
 
-        # date time matching #data points of data
-        datetimes = np.full(data.shape[0], self.ds.dates[idx])
+        # empty geoinfos for anemoi: zero columns, one row per data row
+        geoinfos = np.zeros((data.shape[0], 0), dtype=data.dtype)
 
-        return (data, datetimes)
+        # datetimes meant to match the rows of data
+        # NOTE(review): the window dates are repeated data.shape[0] times, so the result
+        # has data.shape[0] * num_steps_per_window entries; this equals the number of
+        # data rows only for num_steps_per_window == 1 — confirm the intended length.
+        datetimes = np.repeat(
+            np.expand_dims(self.ds.dates[idx : idx + self.num_steps_per_window], 0),
+            data.shape[0],
+            axis=0,
+        ).flatten()
+
+        return (latlon, geoinfos, data, datetimes)
+
+    def get_source_size(self):
+        """
+        Return 2 + number of geoinfo channels + number of source channels.
+
+        The constant 2 presumably accounts for the lat/lon coordinates — TODO confirm.
+        """
+        return 2 + len(self.geoinfo_idx) + len(self.source_idx)
+
+    def get_source_num_channels(self):
+        """
+        Return the number of selected source channels, i.e. len(self.source_idx).
+        """
+        return len(self.source_idx)
+
+    def get_target_size(self):
+        """
+        Return 2 + number of geoinfo channels + number of target channels.
+
+        The constant 2 presumably accounts for the lat/lon coordinates — TODO confirm.
+        """
+        return 2 + len(self.geoinfo_idx) + len(self.target_idx)
+
+    def get_target_num_channels(self):
+        """
+        Return the number of selected target channels, i.e. len(self.target_idx).
+        """
+        return len(self.target_idx)
+
+    def get_geoinfo_size(self):
+        """
+        Return the number of geoinfo channels, i.e. len(self.geoinfo_idx).
+        """
+        return len(self.geoinfo_idx)
+
+    def normalize_coords(self, coords):
+        """
+        Map coordinates given in degrees to values in [-1, 1], in place.
+
+        Entry 0 of the last axis is replaced by sin(x) and entry 1 by sin(x / 2), where
+        x is the entry converted from degrees to radians. Presumably entry 0 is the
+        latitude and entry 1 the longitude, matching the (lat, lon) column order built
+        in _get — TODO confirm against the caller.
+
+        Returns the same array that was passed in.
+        """
+        coords[..., 0] = np.sin(np.deg2rad(coords[..., 0]))
+        coords[..., 1] = np.sin(0.5 * np.deg2rad(coords[..., 1]))
+
+        return coords
+
+    def normalize_geoinfos(self, geoinfos):
+        """
+        Return geoinfos unchanged.
+
+        Asserts that the last axis of geoinfos has size 0, i.e. that there are no
+        geoinfo channels to normalize.
+        """
+
+        assert geoinfos.shape[-1] == 0
+        return geoinfos
+
+    def normalize_source_channels(self, source):
+        """
+        TODO
+        """
+        assert source.shape[1] == len(self.source_idx)
+        for i, ch in enumerate(self.source_idx):
+            source[..., i] = (source[..., i] - self.mean[ch]) / self.stdev[ch]
+
+        return source
+
+    def normalize_target_channels(self, target):
+        """
+        TODO
+        """
+        assert target.shape[1] == len(self.target_idx)
+        for i, ch in enumerate(self.target_idx):
+            target[..., i] = (target[..., i] - self.mean[ch]) / self.stdev[ch]
+
+        return target
 
     def time_window(self, idx: int) -> tuple[np.datetime64, np.datetime64]:
         if not self.ds:
 
@@ -0,0 +1,89 @@
+from datetime import datetime
+
+import numpy as np
+import zarr
+
+
+class AtmorepDataset:
+    def __init__(
+        self,
+        filename: str,
+        start: datetime | int,
+        end: datetime | int,
+        len_hrs: int,
+        step_hrs: int | None = None,
+        normalize: bool = True,
+        select: list[str] | None = None,
+    ):
+        format_str = "%Y%m%d%H%M%S"
+        if type(start) is int:
+            start = datetime.strptime(str(start), format_str)
+
+        if type(end) is int:
+            end = datetime.strptime(str(end), format_str)
+
+        self.normalize = normalize
+        self.filename = filename
+        self.z = zarr.open(filename, mode="r")
+
+        self.lats, self.lons = np.meshgrid(np.array(self.z["lats"]), np.array(self.z["lons"]))
+        self.lats = self.lats.flatten()
+        self.lons = self.lons.flatten()
+        # Reshape lats and lons to be in shape (1, len_hrs, size_lat * size_lon), ready to added to data
+        self.lats = np.expand_dims(np.stack((self.lats,) * len_hrs, axis=1).T, 0)
+        self.lons = np.expand_dims(np.stack((self.lons,) * len_hrs, axis=1).T, 0)
+
+        self.time = np.array(self.z["time"], dtype=np.datetime64)
+        self.start_idx = np.searchsorted(self.time, start)
+        self.end_idx = np.searchsorted(self.time, end)
+
+        assert self.end_idx > self.start_idx, (
+            f"Abort: Final index of {self.end_idx} is the same of larger than start index {self.start_idx}"
+        )
+
+        self.colnames = ["lat", "lon"] + list(self.z.attrs["fields"])
+        self.len_hrs = len_hrs
+        # Ignore step_hrs, idk how it supposed to work
+        self.step_hrs = 1
+
+        self.selected_colnames = self.colnames[2:]
+        self.selected_cols_idx = np.arange(len(self.selected_colnames))
+        self.data = self.z["data"]
+
+        self.properties = {
+            "obs_id": 0,
+            "means": np.zeros(len(self.colnames), dtype=np.float32),
+            "vars": np.ones(len(self.colnames), dtype=np.float32),
+        }
+
+        if select:
+            self.select(select)
+
+    def select(self, cols_list: list[str]) -> None:
+        """
+        Allow user to specify which columns they want to access.
+        Get functions only returned for these specified columns.
+        """
+        self.selected_colnames = cols_list
+        self.selected_cols_idx = np.array([self.colnames.index(item) for item in cols_list])
+
+    def __len__(self):
+        return self.end_idx - self.start_idx - self.len_hrs
+
+    def __getitem__(self, idx: int) -> tuple:
+        start_row = self.start_idx + idx
+        end_row = start_row + self.len_hrs
+
+        data = self.data.oindex[start_row:end_row, :, 0, :, :]
+        datetimes = np.tile(self.time[start_row:end_row], data.shape[-1] * data.shape[-2])
+
+        data = np.reshape(data, (data.shape[1], data.shape[0], -1))
+        data = np.concatenate([self.lats, self.lons, data], 0).T
+        data = np.reshape(data, (-1, data.shape[-1]))
+
+        return (data.astype(np.float32), datetimes)
+
+    def time_window(self, idx: int) -> tuple[np.datetime64, np.datetime64]:
+        start_row = self.start_idx + idx
+        end_row = start_row + self.len_hrs
+        return (self.time[start_row], self.time[end_row])