Commit 9bd4bca

Add flatten layer and update dependencies (#18)
* Add flatten layer and update dependencies
* Reformat
1 parent f662613 commit 9bd4bca

12 files changed, +62 -23 lines

.github/workflows/ci.yml

+1 -7

@@ -32,13 +32,7 @@ jobs:
         run: |
           python -m pip install --upgrade pip
           # cpu version of pytorch
-          pip install torch==1.13+cpu -f https://download.pytorch.org/whl/torch_stable.html
-
-          # # Install Atari Roms
-          # pip install autorom
-          # wget https://gist.githubusercontent.com/jjshoots/61b22aefce4456920ba99f2c36906eda/raw/00046ac3403768bfe45857610a3d333b8e35e026/Roms.tar.gz.b64
-          # base64 Roms.tar.gz.b64 --decode &> Roms.tar.gz
-          # AutoROM --accept-license --source-file Roms.tar.gz
+          pip install torch==2.1.0 --index-url https://download.pytorch.org/whl/cpu

           pip install .[tests]
           # Use headless version

Makefile

+2 -2

@@ -18,13 +18,13 @@ lint:

 format:
 	# Sort imports
-	isort ${LINT_PATHS}
+	ruff --select I ${LINT_PATHS} --fix
 	# Reformat using black
 	black ${LINT_PATHS}

 check-codestyle:
 	# Sort imports
-	isort --check ${LINT_PATHS}
+	ruff --select I ${LINT_PATHS}
 	# Reformat using black
 	black --check ${LINT_PATHS}
pyproject.toml

-5

@@ -16,11 +16,6 @@ max-complexity = 15
 [tool.black]
 line-length = 127

-[tool.isort]
-profile = "black"
-line_length = 127
-src_paths = ["sbx"]
-
 [tool.mypy]
 ignore_missing_imports = true
 follow_imports = "silent"

sbx/common/policies.py

+12

@@ -1,14 +1,26 @@
 # import copy
 from typing import Dict, Optional, Tuple, Union, no_type_check

+import flax.linen as nn
 import jax
+import jax.numpy as jnp
 import numpy as np
 from gymnasium import spaces
 from stable_baselines3.common.policies import BasePolicy
 from stable_baselines3.common.preprocessing import is_image_space, maybe_transpose
 from stable_baselines3.common.utils import is_vectorized_observation


+class Flatten(nn.Module):
+    """
+    Equivalent to PyTorch nn.Flatten() layer.
+    """
+
+    @nn.compact
+    def __call__(self, x: jnp.ndarray) -> jnp.ndarray:
+        return x.reshape((x.shape[0], -1))
+
+
 class BaseJaxPolicy(BasePolicy):
     def __init__(self, *args, **kwargs):
         super().__init__(
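For reference, a minimal sketch (not part of the commit) of what the new Flatten module does when applied on its own; since it defines no parameters, it can be applied with an empty variables dict:

import jax.numpy as jnp

from sbx.common.policies import Flatten

obs = jnp.zeros((4, 2, 1))        # a batch of 4 observations, each of shape (2, 1)
flat = Flatten().apply({}, obs)   # no parameters, so the variables dict is empty
assert flat.shape == (4, 2)       # everything except the batch dimension is collapsed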

sbx/dqn/policies.py

+2 -1

@@ -8,7 +8,7 @@
 from gymnasium import spaces
 from stable_baselines3.common.type_aliases import Schedule

-from sbx.common.policies import BaseJaxPolicy
+from sbx.common.policies import BaseJaxPolicy, Flatten
 from sbx.common.type_aliases import RLTrainState


@@ -18,6 +18,7 @@ class QNetwork(nn.Module):

     @nn.compact
     def __call__(self, x: jnp.ndarray) -> jnp.ndarray:
+        x = Flatten()(x)
         x = nn.Dense(self.n_units)(x)
         x = nn.relu(x)
         x = nn.Dense(self.n_units)(x)
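A short illustrative sketch (not from the commit) of why the extra Flatten() call matters before the Dense layers: flax.linen.Dense only contracts the last axis, so a non-flat observation would otherwise carry its extra dimensions through the whole network.

import flax.linen as nn
import jax
import jax.numpy as jnp

dense = nn.Dense(64)
obs = jnp.zeros((4, 2, 1))               # non-flat observations, as in the new test below
flat = obs.reshape((obs.shape[0], -1))   # what Flatten()(x) produces: shape (4, 2)

# Dense contracts only the last axis, so parameter shapes and outputs differ:
out_raw = dense.apply(dense.init(jax.random.PRNGKey(0), obs), obs)
out_flat = dense.apply(dense.init(jax.random.PRNGKey(0), flat), flat)
print(out_raw.shape)   # (4, 2, 64) -- the extra axis survives without flattening
print(out_flat.shape)  # (4, 64)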

sbx/ppo/policies.py

+3 -1

@@ -12,7 +12,7 @@
 from gymnasium import spaces
 from stable_baselines3.common.type_aliases import Schedule

-from sbx.common.policies import BaseJaxPolicy
+from sbx.common.policies import BaseJaxPolicy, Flatten

 tfp = tensorflow_probability.substrates.jax
 tfd = tfp.distributions
@@ -24,6 +24,7 @@ class Critic(nn.Module):

     @nn.compact
     def __call__(self, x: jnp.ndarray) -> jnp.ndarray:
+        x = Flatten()(x)
         x = nn.Dense(self.n_units)(x)
         x = self.activation_fn(x)
         x = nn.Dense(self.n_units)(x)
@@ -45,6 +46,7 @@ def get_std(self):

     @nn.compact
     def __call__(self, x: jnp.ndarray) -> tfd.Distribution:  # type: ignore[name-defined]
+        x = Flatten()(x)
         x = nn.Dense(self.n_units)(x)
         x = self.activation_fn(x)
         x = nn.Dense(self.n_units)(x)

sbx/sac/policies.py

+3 -1

@@ -11,7 +11,7 @@
 from stable_baselines3.common.type_aliases import Schedule

 from sbx.common.distributions import TanhTransformedDistribution
-from sbx.common.policies import BaseJaxPolicy
+from sbx.common.policies import BaseJaxPolicy, Flatten
 from sbx.common.type_aliases import RLTrainState

 tfp = tensorflow_probability.substrates.jax
@@ -25,6 +25,7 @@ class Critic(nn.Module):

     @nn.compact
     def __call__(self, x: jnp.ndarray, action: jnp.ndarray) -> jnp.ndarray:
+        x = Flatten()(x)
         x = jnp.concatenate([x, action], -1)
         for n_units in self.net_arch:
             x = nn.Dense(n_units)(x)
@@ -75,6 +76,7 @@ def get_std(self):

     @nn.compact
     def __call__(self, x: jnp.ndarray) -> tfd.Distribution:  # type: ignore[name-defined]
+        x = Flatten()(x)
         for n_units in self.net_arch:
             x = nn.Dense(n_units)(x)
             x = nn.relu(x)

sbx/td3/policies.py

+3 -1

@@ -8,7 +8,7 @@
 from gymnasium import spaces
 from stable_baselines3.common.type_aliases import Schedule

-from sbx.common.policies import BaseJaxPolicy
+from sbx.common.policies import BaseJaxPolicy, Flatten
 from sbx.common.type_aliases import RLTrainState


@@ -19,6 +19,7 @@ class Critic(nn.Module):

     @nn.compact
     def __call__(self, x: jnp.ndarray, action: jnp.ndarray) -> jnp.ndarray:
+        x = Flatten()(x)
         x = jnp.concatenate([x, action], -1)
         for n_units in self.net_arch:
             x = nn.Dense(n_units)(x)
@@ -63,6 +64,7 @@ class Actor(nn.Module):

     @nn.compact
     def __call__(self, x: jnp.ndarray) -> jnp.ndarray:  # type: ignore[name-defined]
+        x = Flatten()(x)
         for n_units in self.net_arch:
             x = nn.Dense(n_units)(x)
             x = nn.relu(x)

sbx/tqc/policies.py

+3 -1

@@ -11,7 +11,7 @@
 from stable_baselines3.common.type_aliases import Schedule

 from sbx.common.distributions import TanhTransformedDistribution
-from sbx.common.policies import BaseJaxPolicy
+from sbx.common.policies import BaseJaxPolicy, Flatten
 from sbx.common.type_aliases import RLTrainState

 tfp = tensorflow_probability.substrates.jax
@@ -26,6 +26,7 @@ class Critic(nn.Module):

     @nn.compact
     def __call__(self, x: jnp.ndarray, a: jnp.ndarray, training: bool = False) -> jnp.ndarray:
+        x = Flatten()(x)
         x = jnp.concatenate([x, a], -1)
         for n_units in self.net_arch:
             x = nn.Dense(n_units)(x)
@@ -50,6 +51,7 @@ def get_std(self):

     @nn.compact
     def __call__(self, x: jnp.ndarray) -> tfd.Distribution:  # type: ignore[name-defined]
+        x = Flatten()(x)
         for n_units in self.net_arch:
             x = nn.Dense(n_units)(x)
             x = nn.relu(x)

sbx/version.txt

+1 -1

@@ -1 +1 @@
-0.8.0
+0.9.0

setup.py

+1 -3

@@ -39,7 +39,7 @@
     packages=[package for package in find_packages() if package.startswith("sbx")],
     package_data={"sbx": ["py.typed", "version.txt"]},
     install_requires=[
-        "stable_baselines3>=2.1.0",
+        "stable_baselines3>=2.2.0a9",
         "jax",
         "jaxlib",
         "flax",
@@ -59,8 +59,6 @@
         "mypy",
         # Lint code
         "ruff",
-        # Sort imports
-        "isort>=5.0",
         # Reformat
         "black",
     ],

tests/test_flatten.py

+31

@@ -0,0 +1,31 @@
+from dataclasses import dataclass
+from typing import Dict, Optional
+
+import gymnasium as gym
+import numpy as np
+import pytest
+from gymnasium import spaces
+
+from sbx import DQN, PPO, SAC, TD3, TQC
+
+
+@dataclass
+class DummyEnv(gym.Env):
+    observation_space: spaces.Space
+    action_space: spaces.Space
+
+    def step(self, action):
+        return self.observation_space.sample(), 0.0, False, False, {}
+
+    def reset(self, *, seed: Optional[int] = None, options: Optional[Dict] = None):
+        if seed is not None:
+            super().reset(seed=seed)
+        return self.observation_space.sample(), {}
+
+
+@pytest.mark.parametrize("model_class", [DQN, PPO, SAC, TD3, TQC])
+def test_flatten(model_class) -> None:
+    action_space = spaces.Discrete(15) if model_class == DQN else spaces.Box(-1, 1, shape=(2,), dtype=np.float32)
+    env = DummyEnv(spaces.Box(-1, 1, shape=(2, 1), dtype=np.float32), action_space)
+
+    model_class("MlpPolicy", env).learn(150)
