Commit bc66570

Fix the handling of custom observation shapes and types. This includes enforcing shapes to be passed in as tuples.
PiperOrigin-RevId: 222560771
1 parent 06d64fc commit bc66570
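
After this change, observation_shape must be passed as a tuple of ints wherever an agent or replay buffer is constructed; a bare int no longer means a square frame and now trips the new asserts. A minimal sketch of the new calling convention (argument values are illustrative; only the keyword names come from the diff below):

import tensorflow as tf
from dopamine.agents.dqn import dqn_agent

with tf.Session() as sess:
  # Before this commit, observation_shape=84 was silently expanded to (84, 84).
  # It must now be an explicit tuple, and the observation dtype is threaded
  # through to the replay buffer as well.
  agent = dqn_agent.DQNAgent(
      sess,
      num_actions=4,
      observation_shape=dqn_agent.NATURE_DQN_OBSERVATION_SHAPE,  # (84, 84)
      observation_dtype=dqn_agent.NATURE_DQN_DTYPE,              # tf.uint8
      stack_size=dqn_agent.NATURE_DQN_STACK_SIZE)                # 4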

File tree

8 files changed: +153 / -102 lines changed

dopamine/agents/dqn/dqn_agent.py

Lines changed: 15 additions & 15 deletions
@@ -34,7 +34,7 @@
 slim = tf.contrib.slim


-NATURE_DQN_OBSERVATION_SHAPE = 84  # Size of a downscaled Atari 2600 frame.
+NATURE_DQN_OBSERVATION_SHAPE = (84, 84)  # Size of downscaled Atari 2600 frame.
 NATURE_DQN_DTYPE = tf.uint8  # DType of Atari 2600 observations.
 NATURE_DQN_STACK_SIZE = 4  # Number of frames in the state stack.

@@ -98,8 +98,7 @@ def __init__(self,
     Args:
       sess: `tf.Session`, for executing ops.
       num_actions: int, number of actions the agent can take at any state.
-      observation_shape: tuple of ints or an int. If single int, the observation
-        is assumed to be a 2D square.
+      observation_shape: tuple of ints describing the observation shape.
       observation_dtype: tf.DType, specifies the type of the observations. Note
         that if your inputs are continuous, you should set this to tf.float32.
       stack_size: int, number of frames to use in state stack.
@@ -128,7 +127,7 @@ def __init__(self,
       summary_writing_frequency: int, frequency with which summaries will be
         written. Lower values will result in slower training.
     """
-
+    assert isinstance(observation_shape, tuple)
     tf.logging.info('Creating %s agent with the following parameters:',
                     self.__class__.__name__)
     tf.logging.info('\t gamma: %f', gamma)
@@ -144,11 +143,8 @@ def __init__(self,
     tf.logging.info('\t optimizer: %s', optimizer)

     self.num_actions = num_actions
-    if (isinstance(observation_shape, tuple) or
-        isinstance(observation_shape, list)):
-      self.observation_shape = tuple(observation_shape)
-    else:
-      self.observation_shape = (observation_shape, observation_shape)
+    self.observation_shape = tuple(observation_shape)
+    self.observation_dtype = observation_dtype
     self.stack_size = stack_size
     self.gamma = gamma
     self.update_horizon = update_horizon
@@ -171,7 +167,7 @@ def __init__(self,
     # The last axis indicates the number of consecutive frames stacked.
     state_shape = (1,) + self.observation_shape + (stack_size,)
     self.state = np.zeros(state_shape)
-    self.state_ph = tf.placeholder(observation_dtype, state_shape,
+    self.state_ph = tf.placeholder(self.observation_dtype, state_shape,
                                    name='state_ph')
     self._replay = self._build_replay_buffer(use_staging)
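
Because the shape is now guaranteed to be a tuple, the state shape is built by plain tuple concatenation instead of assuming a square frame, which is what lets non-image observations through. An illustrative shape calculation (the (6, 1) observation shape is only an example, mirroring one of the shapes exercised in the new tests):

import numpy as np

observation_shape = (6, 1)  # e.g. a 6-element column observation
stack_size = 4
state_shape = (1,) + observation_shape + (stack_size,)  # -> (1, 6, 1, 4)
state = np.zeros(state_shape)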

@@ -260,7 +256,8 @@ def _build_replay_buffer(self, use_staging):
         stack_size=self.stack_size,
         use_staging=use_staging,
         update_horizon=self.update_horizon,
-        gamma=self.gamma)
+        gamma=self.gamma,
+        observation_dtype=self.observation_dtype.as_numpy_dtype)

   def _build_target_q_op(self):
     """Build an op used as a target for the Q-value.
@@ -428,11 +425,14 @@ def _record_observation(self, observation):
     Args:
      observation: numpy array, an observation from the environment.
     """
-    # Set current observation. Represents an 84 x 84 x 1 image frame.
-    self._observation = observation[:, :, 0]
+    # Set current observation. We do the reshaping to handle environments
+    # without frame stacking.
+    observation = np.reshape(observation, self.observation_shape)
+    self._observation = observation[..., 0]
+    self._observation = np.reshape(observation, self.observation_shape)
     # Swap out the oldest frame with the current frame.
-    self.state = np.roll(self.state, -1, axis=3)
-    self.state[0, :, :, -1] = self._observation
+    self.state = np.roll(self.state, -1, axis=-1)
+    self.state[0, ..., -1] = self._observation

   def _store_transition(self, last_observation, action, reward, is_terminal):
     """Stores an experienced transition.

dopamine/replay_memory/circular_replay_buffer.py

Lines changed: 5 additions & 8 deletions
@@ -107,8 +107,7 @@ def __init__(self,
     """Initializes OutOfGraphReplayBuffer.

     Args:
-      observation_shape: tuple or int. If int, the observation is
-        assumed to be a 2D square.
+      observation_shape: tuple of ints.
       stack_size: int, number of frames to use in state stack.
       replay_capacity: int, number of transitions to keep in memory.
       batch_size: int.
@@ -125,6 +124,7 @@ def __init__(self,
       ValueError: If replay_capacity is too small to hold at least one
         transition.
     """
+    assert isinstance(observation_shape, tuple)
     if replay_capacity < update_horizon + stack_size:
       raise ValueError('There is not enough capacity to cover '
                        'update_horizon and stack_size.')
@@ -133,16 +133,14 @@ def __init__(self,
         'Creating a %s replay memory with the following parameters:',
         self.__class__.__name__)
     tf.logging.info('\t observation_shape: %s', str(observation_shape))
+    tf.logging.info('\t observation_dtype: %s', str(observation_dtype))
     tf.logging.info('\t stack_size: %d', stack_size)
     tf.logging.info('\t replay_capacity: %d', replay_capacity)
     tf.logging.info('\t batch_size: %d', batch_size)
     tf.logging.info('\t update_horizon: %d', update_horizon)
     tf.logging.info('\t gamma: %f', gamma)

-    if isinstance(observation_shape, tuple):
-      self._observation_shape = observation_shape
-    else:
-      self._observation_shape = (observation_shape, observation_shape)
+    self._observation_shape = observation_shape
     self._stack_size = stack_size
     self._state_shape = self._observation_shape + (self._stack_size,)
     self._replay_capacity = replay_capacity
@@ -663,8 +661,7 @@ def __init__(self,
     """Initializes WrappedReplayBuffer.

     Args:
-      observation_shape: tuple or int. If int, the observation is
-        assumed to be a 2D square.
+      observation_shape: tuple of ints.
       stack_size: int, number of frames to use in state stack.
       use_staging: bool, when True it would use a staging area to prefetch
         the next sampling batch.
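
With observation_dtype now forwarded by the agent (as a numpy dtype via as_numpy_dtype), the out-of-graph buffer can hold non-uint8 observations. A hedged construction sketch; the capacity and batch-size values are arbitrary, and only the tuple shape plus dtype keywords are the point of the example:

import numpy as np
from dopamine.replay_memory import circular_replay_buffer

memory = circular_replay_buffer.OutOfGraphReplayBuffer(
    observation_shape=(6, 1),        # must now be a tuple of ints
    stack_size=1,
    replay_capacity=1000,
    batch_size=32,
    observation_dtype=np.float32)    # e.g. tf.float32.as_numpy_dtype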

dopamine/replay_memory/prioritized_replay_buffer.py

Lines changed: 2 additions & 4 deletions
@@ -53,8 +53,7 @@ def __init__(self,
     """Initializes OutOfGraphPrioritizedReplayBuffer.

     Args:
-      observation_shape: tuple or int. If int, the observation is
-        assumed to be a 2D square with sides equal to observation_shape.
+      observation_shape: tuple of ints.
       stack_size: int, number of frames to use in state stack.
       replay_capacity: int, number of transitions to keep in memory.
       batch_size: int.
@@ -264,8 +263,7 @@ def __init__(self,
     """Initializes WrappedPrioritizedReplayBuffer.

     Args:
-      observation_shape: tuple or int. If int, the observation is
-        assumed to be a 2D square with sides equal to observation_shape.
+      observation_shape: tuple of ints.
       stack_size: int, number of frames to use in state stack.
       use_staging: bool, when True it would use a staging area to prefetch
         the next sampling batch.

tests/dopamine/agents/dqn/dqn_agent_test.py

Lines changed: 86 additions & 19 deletions
@@ -50,7 +50,7 @@ def setUp(self):
     self.observation_dtype = dqn_agent.NATURE_DQN_DTYPE
     self.stack_size = dqn_agent.NATURE_DQN_STACK_SIZE
     self.zero_state = np.zeros(
-        [1, self.observation_shape, self.observation_shape, self.stack_size])
+        (1,) + self.observation_shape + (self.stack_size,))

   def _create_test_agent(self, sess):
     stack_size = self.stack_size
@@ -76,6 +76,9 @@ def _network_template(self, state):

     agent = MockDQNAgent(
         sess=sess,
+        observation_shape=self.observation_shape,
+        observation_dtype=self.observation_dtype,
+        stack_size=self.stack_size,
         num_actions=self.num_actions,
         min_replay_history=self.min_replay_history,
         epsilon_fn=lambda w, x, y, z: 0.0,  # No exploration.
@@ -108,14 +111,12 @@ def testBeginEpisode(self):
       # We fill up the state with 9s. On calling agent.begin_episode the state
       # should be reset to all 0s.
       agent.state.fill(9)
-      first_observation = np.ones(
-          [self.observation_shape, self.observation_shape, 1])
+      first_observation = np.ones(self.observation_shape + (1,))
       self.assertEqual(agent.begin_episode(first_observation), 0)
       # When the all-1s observation is received, it will be placed at the end of
       # the state.
       expected_state = self.zero_state
-      expected_state[:, :, :, -1] = np.ones(
-          [1, self.observation_shape, self.observation_shape])
+      expected_state[:, :, :, -1] = np.ones((1,) + self.observation_shape)
       self.assertAllEqual(agent.state, expected_state)
       self.assertAllEqual(agent._observation, first_observation[:, :, 0])
       # No training happens in eval mode.
@@ -126,13 +127,11 @@ def testBeginEpisode(self):
       # Having a low replay memory add_count will prevent any of the
       # train/prefetch/sync ops from being called.
       agent._replay.memory.add_count = 0
-      second_observation = np.ones(
-          [self.observation_shape, self.observation_shape, 1]) * 2
+      second_observation = np.ones(self.observation_shape + (1,)) * 2
       agent.begin_episode(second_observation)
       # The agent's state will be reset, so we will only be left with the all-2s
       # observation.
-      expected_state[:, :, :, -1] = np.full(
-          (1, self.observation_shape, self.observation_shape), 2)
+      expected_state[:, :, :, -1] = np.full((1,) + self.observation_shape, 2)
       self.assertAllEqual(agent.state, expected_state)
       self.assertAllEqual(agent._observation, second_observation[:, :, 0])
       # training_steps is incremented since we set eval_mode to False.
@@ -145,8 +144,7 @@ def testStepEval(self):
     """
     with tf.Session() as sess:
       agent = self._create_test_agent(sess)
-      base_observation = np.ones(
-          [self.observation_shape, self.observation_shape, 1])
+      base_observation = np.ones(self.observation_shape + (1,))
       # This will reset state and choose a first action.
       agent.begin_episode(base_observation)
       # We mock the replay buffer to verify how the agent interacts with it.
@@ -163,12 +161,11 @@ def testStepEval(self):
         stack_pos = step - num_steps - 1
         if stack_pos >= -self.stack_size:
           expected_state[:, :, :, stack_pos] = np.full(
-              (1, self.observation_shape, self.observation_shape), step)
+              (1,) + self.observation_shape, step)
       self.assertAllEqual(agent.state, expected_state)
       self.assertAllEqual(
           agent._last_observation,
-          np.ones([self.observation_shape, self.observation_shape]) *
-          (num_steps - 1))
+          np.ones(self.observation_shape) * (num_steps - 1))
       self.assertAllEqual(agent._observation, observation[:, :, 0])
       # No training happens in eval mode.
       self.assertEqual(agent.training_steps, 0)
@@ -183,8 +180,7 @@ def testStepTrain(self):
     with tf.Session() as sess:
       agent = self._create_test_agent(sess)
       agent.eval_mode = False
-      base_observation = np.ones(
-          [self.observation_shape, self.observation_shape, 1])
+      base_observation = np.ones(self.observation_shape + (1,))
       # We mock the replay buffer to verify how the agent interacts with it.
       agent._replay = test_utils.MockReplayBuffer()
       self.evaluate(tf.global_variables_initializer())
@@ -203,7 +199,7 @@ def testStepTrain(self):
         stack_pos = step - num_steps - 1
         if stack_pos >= -self.stack_size:
           expected_state[:, :, :, stack_pos] = np.full(
-              (1, self.observation_shape, self.observation_shape), step)
+              (1,) + self.observation_shape, step)
         self.assertEqual(agent._replay.add.call_count, step)
         mock_args, _ = agent._replay.add.call_args
         self.assertAllEqual(last_observation[:, :, 0], mock_args[0])
@@ -213,8 +209,7 @@ def testStepTrain(self):
       self.assertAllEqual(agent.state, expected_state)
       self.assertAllEqual(
           agent._last_observation,
-          np.full((self.observation_shape, self.observation_shape),
-                  num_steps - 1))
+          np.full(self.observation_shape, num_steps - 1))
       self.assertAllEqual(agent._observation, observation[:, :, 0])
       # We expect one more than num_steps because of the call to begin_episode.
       self.assertEqual(agent.training_steps, num_steps + 1)
@@ -228,6 +223,78 @@ def testStepTrain(self):
       self.assertAllEqual(1, mock_args[2])  # Reward received.
       self.assertTrue(mock_args[3])  # is_terminal

+  def testNonTupleObservationShape(self):
+    with self.assertRaises(AssertionError):
+      self.observation_shape = 84
+      with tf.Session() as sess:
+        _ = self._create_test_agent(sess)
+
+  def _testCustomShapes(self, shape, dtype, stack_size):
+    self.observation_shape = shape
+    self.observation_dtype = dtype
+    self.stack_size = stack_size
+    self.zero_state = np.zeros((1,) + shape + (stack_size,))
+    with tf.Session() as sess:
+      agent = self._create_test_agent(sess)
+      agent.eval_mode = False
+      base_observation = np.ones(self.observation_shape + (1,))
+      # We mock the replay buffer to verify how the agent interacts with it.
+      agent._replay = test_utils.MockReplayBuffer()
+      self.evaluate(tf.global_variables_initializer())
+      # This will reset state and choose a first action.
+      agent.begin_episode(base_observation)
+      observation = base_observation
+
+      expected_state = self.zero_state
+      num_steps = 10
+      for step in range(1, num_steps + 1):
+        # We make observation a multiple of step for testing purposes (to
+        # uniquely identify each observation).
+        last_observation = observation
+        observation = base_observation * step
+        self.assertEqual(agent.step(reward=1, observation=observation), 0)
+        stack_pos = step - num_steps - 1
+        if stack_pos >= -self.stack_size:
+          expected_state[..., stack_pos] = np.full(
+              (1,) + self.observation_shape, step)
+        self.assertEqual(agent._replay.add.call_count, step)
+        mock_args, _ = agent._replay.add.call_args
+        self.assertAllEqual(last_observation[..., 0], mock_args[0])
+        self.assertAllEqual(0, mock_args[1])  # Action selected.
+        self.assertAllEqual(1, mock_args[2])  # Reward received.
+        self.assertFalse(mock_args[3])  # is_terminal
+      self.assertAllEqual(agent.state, expected_state)
+      self.assertAllEqual(
+          agent._last_observation,
+          np.full(self.observation_shape, num_steps - 1))
+      self.assertAllEqual(agent._observation, observation[..., 0])
+      # We expect one more than num_steps because of the call to begin_episode.
+      self.assertEqual(agent.training_steps, num_steps + 1)
+      self.assertEqual(agent._replay.add.call_count, num_steps)
+
+      agent.end_episode(reward=1)
+      self.assertEqual(agent._replay.add.call_count, num_steps + 1)
+      mock_args, _ = agent._replay.add.call_args
+      self.assertAllEqual(observation[..., 0], mock_args[0])
+      self.assertAllEqual(0, mock_args[1])  # Action selected.
+      self.assertAllEqual(1, mock_args[2])  # Reward received.
+      self.assertTrue(mock_args[3])  # is_terminal
+
+  def testStepTrainCustomObservationShapes(self):
+    custom_shapes = [(1,), (4, 4), (6, 1), (1, 6), (1, 1, 6), (6, 6, 6, 6)]
+    for shape in custom_shapes:
+      self._testCustomShapes(shape, tf.uint8, 1)
+
+  def testStepTrainCustomTypes(self):
+    custom_types = [tf.float32, tf.uint8, tf.int64]
+    for dtype in custom_types:
+      self._testCustomShapes((4, 4), dtype, 1)
+
+  def testStepTrainCustomStackSizes(self):
+    custom_stack_sizes = [1, 4, 8]
+    for stack_size in custom_stack_sizes:
+      self._testCustomShapes((4, 4), tf.uint8, stack_size)
+
   def testLinearlyDecayingEpsilon(self):
     """Test the functionality of the linearly_decaying_epsilon function."""
     decay_period = 100

tests/dopamine/agents/implicit_quantile/implicit_quantile_agent_test.py

Lines changed: 1 addition & 1 deletion
@@ -37,7 +37,7 @@ def setUp(self):
     self.observation_dtype = dqn_agent.NATURE_DQN_DTYPE
     self.stack_size = dqn_agent.NATURE_DQN_STACK_SIZE
     self.ones_state = np.ones(
-        [1, self.observation_shape, self.observation_shape, self.stack_size])
+        (1,) + self.observation_shape + (self.stack_size,))

   def _create_test_agent(self, sess):
