Merge pull request #102 from mahdip72/main

amorehead · web-flow · commit da7cfe6d3e46 · 2025-04-27T13:17:58.000-05:00
Enable AMP Compatibility for GCP Model to Reduce VRAM Usage
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -16,6 +16,7 @@
 * Add support for handling backward OOMs gracefully [#83](https://github.com/a-r-j/ProteinWorkshop/pull/83)
 * Update GCPNet paper link [#85](https://github.com/a-r-j/ProteinWorkshop/pull/85)
 * Add ability for `BenchmarkModel` to have its decoder disabled [#101](https://github.com/a-r-j/ProteinWorkshop/pull/101)
+* Fix dtype mismatch in `gcp.py` that broke Automatic Mixed Precision (AMP) training [#102](https://github.com/a-r-j/ProteinWorkshop/pull/102)
 
 ### Framework
 
diff --git a/proteinworkshop/models/graph_encoders/layers/gcp.py b/proteinworkshop/models/graph_encoders/layers/gcp.py
@@ -265,14 +265,20 @@ def scalarize(
 
         if node_mask is not None:
             edge_mask = node_mask[row] & node_mask[col]
+            # Initialize destination tensor
             local_scalar_rep_i = torch.zeros(
                 (edge_index.shape[1], 3, 3), device=edge_index.device
             )
-            local_scalar_rep_i[edge_mask] = torch.matmul(
+            # Compute the source values; under AMP autocast, matmul returns a reduced-precision tensor (float16 or bfloat16)
+            matmul_result = torch.matmul(
                 frames[edge_mask], vector_rep_i[edge_mask]
             )
+            # Explicitly cast the source value to the destination's dtype before assignment
+            local_scalar_rep_i[edge_mask] = matmul_result.to(local_scalar_rep_i.dtype)
+
             local_scalar_rep_i = local_scalar_rep_i.transpose(-1, -2)
         else:
+            # No preallocated destination here, so no dtype mismatch on assignment; under AMP the result keeps matmul's dtype (TODO: confirm downstream code accepts it)
             local_scalar_rep_i = torch.matmul(frames, vector_rep_i).transpose(-1, -2)
 
         # potentially enable E(3)-equivariance and, thereby, chirality-invariance