@@ -7,7 +7,7 @@
 # granted to it by virtue of its status as an intergovernmental organisation
 # nor does it submit to any jurisdiction.
 
-
+import numpy as np
 import torch
 from torch.utils.checkpoint import checkpoint
 
@@ -45,6 +45,7 @@ def __init__(
         super(StreamEmbedTransformer, self).__init__()
 
         self.num_tokens = num_tokens
+        self.token_size = token_size
         self.num_channels = num_channels
         self.dim_in = token_size if mode == "channels" else num_channels
         self.dim_embed = dim_embed
@@ -56,8 +57,6 @@ def __init__(
 
         norm = torch.nn.LayerNorm if norm_type == "LayerNorm" else RMSNorm
 
-        self.embed = torch.nn.Linear(self.dim_in, self.dim_embed)
-
         self.layers = torch.nn.ModuleList()
         for _ in range(self.num_blocks):
             self.layers.append(
@@ -80,6 +79,8 @@ def __init__(
             )
 
         if mode == "channels":
+            self.embed = torch.nn.Linear(self.dim_in, self.dim_embed)
+
             if self.unembed_mode == "full":
                 self.ln_final = norm(num_channels * self.dim_embed)
                 self.unembed = torch.nn.Linear(
@@ -94,6 +95,11 @@ def __init__(
                 dim_out = (self.num_tokens * self.dim_out - embed_size_centroids) // num_channels
                 self.unembed = torch.nn.ModuleList(
                     [torch.nn.Linear(dim_embed, dim_out) for _ in range(num_channels)]
+                    # [
+                    #     torch.nn.Sequential(torch.nn.Linear(dim_embed, max(dim_embed // 2, 4 * dim_out)),
+                    #                         torch.nn.GELU(),
+                    #                         torch.nn.Linear(max(dim_embed // 2, 4 * dim_out), dim_out)) for _ in range(num_channels)
+                    # ]
                 )
                 self.ln_final = torch.nn.ModuleList([norm(dim_embed) for _ in range(num_channels)])
 
@@ -103,9 +109,12 @@ def __init__(
             self.forward = self.forward_channels
 
         elif mode == "columns":
+            assert embed_size_centroids == 0
+            self.embed = torch.nn.Linear(self.dim_in, self.dim_embed)
+
             assert self.unembed_mode == "block"  # only supported mode at the moment
             # padding needed if the unembedded columns cannot be concatenated to dim_out (e.g GPSRO)
-            self.pad = (self.dim_out - embed_size_centroids) % token_size
+            self.pad = self.dim_out % token_size
             self.out_pad = torch.nn.Parameter(torch.zeros(self.pad))
             self.unembed = torch.nn.Linear(
                 self.dim_embed,
@@ -114,6 +123,13 @@ def __init__(
             self.ln_final = norm(dim_out)
             self.forward = self.forward_columns
 
+            # TODO: factorization when sqrt is not int
+            dim1 = int(np.sqrt(dim_out))
+            assert dim1 * dim1 == dim_out
+            self.unembed1 = torch.nn.Linear(self.dim_embed, dim1)
+            self.unembed_nonlin = torch.nn.GELU()
+            self.unembed2 = torch.nn.Linear(self.token_size, dim1)
+
         else:
             assert False
 
@@ -135,7 +151,7 @@ def forward_channels(self, x_in, centroids):
         elif self.unembed_mode == "block":
             out = [
                 checkpoint(ue, ln(x[:, i]), use_reentrant=False)
-                for i, (ue, ln) in enumerate(zip(self.unembed, self.ln_final, strict=False))
+                for i, (ue, ln) in enumerate(zip(self.unembed, self.ln_final, strict=True))
             ]
             out = torch.stack(out, dim=1).flatten(-2, -1)
         else:
@@ -153,27 +169,22 @@ def forward_channels(self, x_in, centroids):
 
         return out
 
-    # @torch.compile( dynamic=True)
     def forward_columns(self, x_in, centroids):
         # embed provided input data
         x = positional_encoding_harmonic(checkpoint(self.embed, x_in, use_reentrant=False))
 
         for layer in self.layers:
             x = checkpoint(layer, x, use_reentrant=False)
 
-        # append centroids
-        # unembed and reshape
-        out = checkpoint(self.unembed, x, use_reentrant=False)
-        out = out.flatten(-2, -1).reshape(x.shape[0], self.num_tokens, -1)
-        # TODO: unsqueeze will not work with num_tokens > 1
-        out = torch.cat([out, self.embed_centroids(centroids).unsqueeze(1)], -1)
-        # pad to uniform dim_out (that has to be uniform across streams)
-        if self.pad > 0:
-            out = torch.cat((out, self.out_pad.repeat((x.shape[0], self.num_tokens, 1))), -1)
-        # also encode centroids with overlayed positional encoding
+        out = checkpoint(self.unembed1, x, use_reentrant=False)
+        out = self.unembed_nonlin(out)
+        out = checkpoint(self.unembed2, out.transpose(-2, -1), use_reentrant=False)
+        out = out.flatten(-2, -1).unsqueeze(1)
+
+        # final normalize and dropout
         out = self.dropout_final(self.ln_final(out))
 
-        return out
+        return out.to(torch.float16)
 
 
 class StreamEmbedLinear(torch.nn.Module):
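
Note on the column-mode change: the single `self.unembed` projection (plus centroid concatenation and padding) is replaced by a two-sided factorization, where `unembed1` acts along the embedding dimension and `unembed2` along the token dimension, so the flattened result has size `dim1 * dim1 == dim_out`. The following is a minimal sketch of the shape logic only; the values of `batch`, `token_size`, `dim_embed`, and `dim_out` are illustrative assumptions, not taken from the repository.

```python
import math

import torch

# Illustrative sizes (assumptions for this sketch).
batch, token_size, dim_embed, dim_out = 4, 12, 256, 144

dim1 = int(math.isqrt(dim_out))
assert dim1 * dim1 == dim_out  # factorization currently requires a square dim_out (see TODO above)

unembed1 = torch.nn.Linear(dim_embed, dim1)   # projects the embedding dimension
unembed2 = torch.nn.Linear(token_size, dim1)  # projects the token/column dimension
nonlin = torch.nn.GELU()

x = torch.randn(batch, token_size, dim_embed)  # stand-in for the transformer output
out = nonlin(unembed1(x))                      # (batch, token_size, dim1)
out = unembed2(out.transpose(-2, -1))          # (batch, dim1, dim1)
out = out.flatten(-2, -1).unsqueeze(1)         # (batch, 1, dim1 * dim1) == (batch, 1, dim_out)
assert out.shape == (batch, 1, dim_out)
```

As the TODO in the diff notes, this only works when `dim_out` is a perfect square; a more general factorization is left for later.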