bit-bots
diff --git a/‎ddlitlab2024/ml/model/encoder/action_history.py renamed to ‎ddlitlab2024/ml/model/encoder/base.py
Lines changed: 8 additions & 9 deletions b/‎ddlitlab2024/ml/model/encoder/action_history.py renamed to ‎ddlitlab2024/ml/model/encoder/base.py
Lines changed: 8 additions & 9 deletions
diff --git a/‎ddlitlab2024/ml/model/encoder/image.py
Lines changed: 143 additions & 0 deletions b/‎ddlitlab2024/ml/model/encoder/image.py
Lines changed: 143 additions & 0 deletions
diff --git a/‎ddlitlab2024/ml/model/encoder/imu.py
Lines changed: 50 additions & 0 deletions b/‎ddlitlab2024/ml/model/encoder/imu.py
Lines changed: 50 additions & 0 deletions
diff --git a/‎ddlitlab2024/ml/model/encoder/joint.py
Lines changed: 25 additions & 0 deletions b/‎ddlitlab2024/ml/model/encoder/joint.py
Lines changed: 25 additions & 0 deletions
diff --git a/‎ddlitlab2024/ml/model/encoder/joint_states.py
Lines changed: 0 additions & 9 deletions b/‎ddlitlab2024/ml/model/encoder/joint_states.py
Lines changed: 0 additions & 9 deletions
@@ -4,23 +4,23 @@
 from ddlitlab2024.ml.model.misc import PositionalEncoding
 
 
-class ActionHistoryEncoder(nn.Module):
+class BaseEncoder(nn.Module):
     """
-    Transformer encoder that encodes the action history of the robot.
+    Transformer encoder that encodes a sequence of input vectors into context tokens.
     """
 
-    def __init__(self, num_joints, hidden_dim, num_layers, num_heads, max_seq_len):
+    def __init__(self, input_dim: int, hidden_dim: int, num_layers: int, num_heads: int, max_seq_len: int):
         """
         Initializes the module.
 
-        :param num_joints: The number of joints in the robot.
+        :param input_dim: The number of input dimensions.
         :param hidden_dim: The number of hidden dimensions.
         :param num_layers: The number of transformer layers.
         :param num_heads: The number of attention heads.
         :param max_seq_len: The maximum length of the input sequences (used for positional encoding
         """
         super().__init__()
-        self.embedding = nn.Linear(num_joints, hidden_dim)
+        self.embedding = nn.Linear(input_dim, hidden_dim)
         self.positional_encoding = PositionalEncoding(hidden_dim, max_seq_len)
         self.transformer_encoder = nn.TransformerEncoder(
             nn.TransformerEncoderLayer(
@@ -34,14 +34,13 @@ def __init__(self, num_joints, hidden_dim, num_layers, num_heads, max_seq_len):
             num_layers=num_layers,
         )
 
-    def forward(self, past_actions: torch.Tensor) -> torch.Tensor:
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
         """
-        Encodes the past actions of the robot as context tokens.
+        Encodes the input vectors into context tokens.
 
-        :param past_actions: The past actions of the robot. Shape: (batch_size, seq_len, joint)
+        :param x: The input vectors. Shape: (batch_size, seq_len, input_dim)
         :return: The encoded context tokens. Shape: (batch_size, seq_len, hidden_dim)
         """
-        x = past_actions
         # Embed the input
         x = self.embedding(x)
         # Positional encoding
 
@@ -0,0 +1,143 @@
+from enum import Enum
+
+import torch
+from torch import nn
+from torchvision.models import resnet18, resnet50, swin_s, swin_t
+
+from ddlitlab2024.ml.model.encoder.base import BaseEncoder
+
+
+class ImageEncoderType(Enum):
+    """
+    Enum class for the image encoder types.
+
+    The members name the image backbones that ``image_encoder_factory`` can build.
+    """
+
+    # ResNet variants, handled by ResNetImageEncoder
+    RESNET18 = "resnet18"
+    RESNET50 = "resnet50"
+    # Swin Transformer variants, handled by SwinTransformerImageEncoder
+    SWIN_TRANSFORMER_TINY = "swin_transformer_tiny"
+    SWIN_TRANSFORMER_SMALL = "swin_transformer_small"
+
+
+class SequenceEncoderType(Enum):
+    """
+    Enum class for the sequence encoder types.
+
+    Selects what ``image_sequence_encoder_factory`` puts on top of the image encoder.
+    """
+
+    # Run a transformer encoder over the sequence of encoded images
+    TRANSFORMER = "transformer"
+    # No sequence model, the plain image encoder is used
+    NONE = "none"
+
+
+class AbstractImageEncoder(nn.Module):
+    """
+    Abstract class for image encoders.
+
+    Subclasses must assign ``self.encoder``, the module that is applied to the flattened batch of images.
+    """
+
+    encoder: nn.Module
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Forward pass of the image encoder.
+
+        :param x: A sequence of images. The first two dimensions are (batch_size, seq_len),
+            all remaining dimensions are handed to the encoder unchanged.
+        :return: A sequence of encoded images. Shape: (batch_size, seq_len, -1)
+        """
+        # Squash the sequence dimension together with the batch dimension.
+        # reshape (instead of view) also accepts non-contiguous inputs, e.g. images that were permuted before.
+        images = x.reshape(-1, *x.shape[2:])
+
+        # Encode the images into tokens
+        tokens = self.encoder(images)
+
+        # Restore the original sequence dimension
+        return tokens.reshape(x.shape[0], x.shape[1], -1)
+
+
+class ResNetImageEncoder(AbstractImageEncoder):
+    """
+    ResNet image encoder.
+
+    Wraps a torchvision ResNet with pretrained weights and replaces its final fully connected
+    layer with a linear projection to ``hidden_dim``.
+    """
+
+    def __init__(self, resnet_type: ImageEncoderType, hidden_dim: int):
+        """
+        Initializes the module.
+
+        :param resnet_type: The ResNet variant to use (RESNET18 or RESNET50).
+        :param hidden_dim: The number of output features per image.
+        :raises ValueError: If the given type is not a ResNet variant.
+        """
+        super().__init__()
+        # The `pretrained` flag is deprecated in torchvision.
+        # "IMAGENET1K_V1" selects the same weights that `pretrained=True` used to load.
+        match resnet_type:
+            case ImageEncoderType.RESNET18:
+                self.encoder = resnet18(weights="IMAGENET1K_V1")
+            case ImageEncoderType.RESNET50:
+                self.encoder = resnet50(weights="IMAGENET1K_V1")
+            case _:
+                raise ValueError(f"Invalid ResNet type: {resnet_type}")
+        # TODO check for softmax layer etc.
+        # Swap the classification layer for a projection to the hidden dimension
+        self.encoder.fc = nn.Linear(self.encoder.fc.in_features, hidden_dim)
+
+
+class SwinTransformerImageEncoder(AbstractImageEncoder):
+    """
+    Image encoder backed by a torchvision Swin Transformer.
+
+    The head of the backbone is swapped for a linear layer with ``hidden_dim`` outputs.
+    """
+
+    def __init__(self, swin_type: ImageEncoderType, hidden_dim: int):
+        """
+        Initializes the module.
+
+        :param swin_type: The Swin Transformer variant (SWIN_TRANSFORMER_TINY or SWIN_TRANSFORMER_SMALL).
+        :param hidden_dim: The number of output features per image.
+        :raises ValueError: If the given type is not a Swin Transformer variant.
+        """
+        super().__init__()
+        # NOTE(review): no weights are requested here, unlike in ResNetImageEncoder - confirm this is intended
+        if swin_type == ImageEncoderType.SWIN_TRANSFORMER_TINY:
+            backbone = swin_t()
+        elif swin_type == ImageEncoderType.SWIN_TRANSFORMER_SMALL:
+            backbone = swin_s()
+        else:
+            raise ValueError(f"Invalid Swin Transformer type: {swin_type}")
+
+        # Project the backbone features to the hidden dimension
+        backbone.head = nn.Linear(backbone.head.in_features, hidden_dim)
+        self.encoder = backbone
+
+
+class TransformerImageSequenceEncoder(nn.Module):
+    """
+    Transformer image sequence encoder.
+
+    Encodes every image with the given image encoder and then passes the resulting
+    sequence through a transformer encoder.
+    """
+
+    def __init__(
+        self,
+        image_encoder: AbstractImageEncoder,
+        hidden_dim: int,
+        num_layers: int,
+        max_seq_len: int,
+        num_heads: int = 8,
+    ):
+        """
+        Initializes the module.
+
+        :param image_encoder: The encoder that is applied to the images of the sequence.
+        :param hidden_dim: The number of hidden dimensions. It is also used as the input dimension
+            of the transformer, so the image encoder has to produce ``hidden_dim`` features per image.
+        :param num_layers: The number of transformer layers.
+        :param max_seq_len: The maximum length of the input sequences (used for positional encoding)
+        :param num_heads: The number of attention heads (defaults to the previously hard-coded 8).
+        """
+        super().__init__()
+        self.image_encoder = image_encoder
+        self.transformer_encoder = BaseEncoder(
+            input_dim=hidden_dim,
+            hidden_dim=hidden_dim,
+            num_layers=num_layers,
+            num_heads=num_heads,
+            max_seq_len=max_seq_len,
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Encodes a sequence of images into context tokens.
+
+        :param x: A sequence of images, as accepted by the image encoder.
+        :return: The output of the transformer encoder applied to the encoded images.
+        """
+        return self.transformer_encoder(self.image_encoder(x))
+
+
+def image_encoder_factory(encoder_type: ImageEncoderType, hidden_dim: int) -> AbstractImageEncoder:
+    """
+    Builds the image encoder that belongs to the requested type.
+
+    :param encoder_type: The type of the image encoder.
+    :param hidden_dim: The number of output features, forwarded to the encoder.
+    :return: The image encoder.
+    :raises ValueError: If the encoder type is not known.
+    """
+    match encoder_type:
+        case ImageEncoderType.RESNET18 | ImageEncoderType.RESNET50:
+            return ResNetImageEncoder(encoder_type, hidden_dim)
+        case ImageEncoderType.SWIN_TRANSFORMER_TINY | ImageEncoderType.SWIN_TRANSFORMER_SMALL:
+            return SwinTransformerImageEncoder(encoder_type, hidden_dim)
+        case _:
+            raise ValueError(f"Invalid image encoder type: {encoder_type}")
+
+
+def image_sequence_encoder_factory(
+    encoder_type: SequenceEncoderType,
+    image_encoder_type: ImageEncoderType,
+    hidden_dim: int,
+    num_layers: int,
+    max_seq_len: int,
+):
+    """
+    Builds an encoder for sequences of images.
+
+    :param encoder_type: The type of the sequence encoder that allows communication between different images.
+        If no sequence encoder is needed, the image encoder is returned.
+    :param image_encoder_type: The type of the image encoder.
+    :param hidden_dim: The number of hidden dimensions, used for the image encoder and the sequence encoder.
+    :param num_layers: The number of transformer layers (only used by the transformer sequence encoder).
+    :param max_seq_len: The maximum sequence length (only used by the transformer sequence encoder).
+    :return: The image sequence encoder.
+    :raises ValueError: If the image encoder type or the sequence encoder type is not known.
+    """
+    # The image encoder is always created first, also when the sequence encoder type turns out to be invalid
+    backbone = image_encoder_factory(image_encoder_type, hidden_dim)
+
+    if encoder_type == SequenceEncoderType.TRANSFORMER:
+        return TransformerImageSequenceEncoder(backbone, hidden_dim, num_layers, max_seq_len)
+    if encoder_type == SequenceEncoderType.NONE:
+        return backbone
+    raise ValueError(f"Invalid sequence encoder type: {encoder_type}")
@@ -0,0 +1,50 @@
+from enum import Enum
+
+from ddlitlab2024.ml.model.encoder.base import BaseEncoder
+
+
+class IMUEncoder(BaseEncoder):
+    """
+    Transformer encoder that encodes the IMU orientation data of the robot.
+    """
+
+    class OrientationEmbeddingMethod(Enum):
+        """
+        Enum class for the orientation embedding methods.
+        """
+
+        QUATERNION = "quaternion"
+        FIVE_DIM = "five_dim"  # Axis-angle with 2d vector for the angle
+
+    def __init__(
+        self,
+        orientation_embedding_method: OrientationEmbeddingMethod,
+        hidden_dim: int,
+        num_layers: int,
+        num_heads: int,
+        max_seq_len: int,
+    ):
+        """
+        Initializes the module.
+
+        :param orientation_embedding_method: The method used to embed the orientation data.
+        :param hidden_dim: The number of hidden dimensions.
+        :param num_layers: The number of transformer layers.
+        :param num_heads: The number of attention heads.
+        :param max_seq_len: The maximum length of the input sequences (used for positional encoding)
+        :raises ValueError: If the orientation embedding method is not known.
+        """
+
+        # Calculate the number of input features
+        match orientation_embedding_method:
+            case IMUEncoder.OrientationEmbeddingMethod.QUATERNION:
+                input_features = 4
+            case IMUEncoder.OrientationEmbeddingMethod.FIVE_DIM:
+                input_features = 5
+            case _:
+                # Without this case an unknown method would surface as an UnboundLocalError below
+                raise ValueError(f"Invalid orientation embedding method: {orientation_embedding_method}")
+
+        super().__init__(
+            input_dim=input_features,
+            hidden_dim=hidden_dim,
+            num_layers=num_layers,
+            num_heads=num_heads,
+            max_seq_len=max_seq_len,
+        )
@@ -0,0 +1,25 @@
+from ddlitlab2024.ml.model.encoder.base import BaseEncoder
+
+
+class JointEncoder(BaseEncoder):
+    """
+    Joint encoder that encodes the joint states of the robot.
+
+    It is a BaseEncoder whose input dimension is the number of joints.
+    """
+
+    def __init__(self, num_joints: int, hidden_dim: int, num_layers: int, num_heads: int, max_seq_len: int):
+        """
+        Initializes the module.
+
+        :param num_joints: The number of joints in the robot.
+        :param hidden_dim: The number of hidden dimensions.
+        :param num_layers: The number of transformer layers.
+        :param num_heads: The number of attention heads.
+        :param max_seq_len: The maximum length of the input sequences (used for positional encoding)
+        """
+        # One input feature per joint
+        super().__init__(
+            input_dim=num_joints,
+            hidden_dim=hidden_dim,
+            num_layers=num_layers,
+            num_heads=num_heads,
+            max_seq_len=max_seq_len,
+        )