
Variable Horizon in seals/CartPole  #56

Open
@lcotetur

Description

from imitation.algorithms.adversarial.airl import AIRL
from imitation.rewards.reward_nets import BasicShapedRewardNet
from imitation.util.networks import RunningNorm
from stable_baselines3 import PPO
from stable_baselines3.ppo import MlpPolicy
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.vec_env import DummyVecEnv

import gym
import seals  # registers the seals/CartPole-v0 environment

learners_rewards_after_training = []
learners_rewards_before_training = []

# 8 parallel copies of the fixed-horizon CartPole environment
venv = DummyVecEnv([lambda: gym.make("seals/CartPole-v0")] * 8)

learner = PPO(
    env=venv,
    policy=MlpPolicy,
    batch_size=64,
    ent_coef=0.0,
    learning_rate=0.0003,
    n_epochs=10,
)
reward_net = BasicShapedRewardNet(
    venv.observation_space, venv.action_space, normalize_input_layer=RunningNorm
)
airl_trainer = AIRL(
    demonstrations=rollouts,  # expert rollouts, as in the demo notebook (see sketch below)
    demo_batch_size=1024,
    gen_replay_buffer_capacity=2048,
    n_disc_updates_per_round=4,
    venv=venv,
    gen_algo=learner,
    reward_net=reward_net,
)

for i in range(10):
    # Evaluate the learner before this round of AIRL training.
    learner_rewards_before_training, _ = evaluate_policy(
        learner, venv, 100, return_episode_rewards=True
    )
    learners_rewards_before_training.append(learner_rewards_before_training)

    airl_trainer.train(20000)  # Note: set to 300000 for better results

    # Evaluate the learner after this round of AIRL training.
    learner_rewards_after_training, _ = evaluate_policy(
        learner, venv, 100, return_episode_rewards=True
    )
    learners_rewards_after_training.append(learner_rewards_after_training)
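
Note: `rollouts` above is the set of expert demonstrations collected earlier in the demo notebook. A minimal sketch of that step, assuming an expert PPO policy and the `imitation.data.rollout` helpers (exact arguments may differ between imitation versions):

from imitation.data import rollout
from imitation.data.wrappers import RolloutInfoWrapper

# Hypothetical expert: a PPO policy trained on seals/CartPole-v0 beforehand.
expert = PPO(env=venv, policy=MlpPolicy)
expert.learn(100_000)

# Wrap the envs so complete episodes are recorded for demonstration collection.
rollout_venv = DummyVecEnv(
    [lambda: RolloutInfoWrapper(gym.make("seals/CartPole-v0"))] * 8
)
rollouts = rollout.rollout(
    expert,
    rollout_venv,
    rollout.make_sample_until(min_timesteps=None, min_episodes=60),
)

With that in place, running the training loop above produces: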
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_16872\944136942.py in <module>
     41 
     42 
---> 43     airl_trainer.train(20000)  # Note: set to 300000 for better results
     44     learner_rewards_after_training, _ = evaluate_policy(
     45         learner, venv, 100, return_episode_rewards=True

c:\users\stephane\documents\imitation\src\imitation\algorithms\adversarial\common.py in train(self, total_timesteps, callback)
    416         )
    417         for r in tqdm.tqdm(range(0, n_rounds), desc="round"):
--> 418             self.train_gen(self.gen_train_timesteps)
    419             for _ in range(self.n_disc_updates_per_round):
    420                 with networks.training(self.reward_train):

c:\users\stephane\documents\imitation\src\imitation\algorithms\adversarial\common.py in train_gen(self, total_timesteps, learn_kwargs)
    385 
    386         gen_trajs, ep_lens = self.venv_buffering.pop_trajectories()
--> 387         self._check_fixed_horizon(ep_lens)
    388         gen_samples = rollout.flatten_trajectories_with_rew(gen_trajs)
    389         self._gen_replay_buffer.store(gen_samples)

c:\users\stephane\documents\imitation\src\imitation\algorithms\base.py in _check_fixed_horizon(self, horizons)
     89         if len(horizons) > 1:
     90             raise ValueError(
---> 91                 f"Episodes of different length detected: {horizons}. "
     92                 "Variable horizon environments are discouraged -- "
     93                 "termination conditions leak information about reward. See"

ValueError: Episodes of different length detected: {548, 500}. Variable horizon environments are discouraged -- termination conditions leak information about reward. Seehttps://imitation.readthedocs.io/en/latest/guide/variable_horizon.html for more information. If you are SURE you want to run imitation on a variable horizon task, then please pass in the flag: `allow_variable_horizon=True`.

This happens when running the demo from https://github.com/HumanCompatibleAI/imitation/blob/master/examples/4_train_airl.ipynb with a for loop around the training steps: it produces episodes of different horizons.
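
If variable-horizon episodes were actually intended, the error message suggests a stopgap: construct the trainer with the flag it names (the docs discourage this, since termination conditions can leak information about the reward). A sketch, reusing the objects defined above:

airl_trainer = AIRL(
    demonstrations=rollouts,
    demo_batch_size=1024,
    gen_replay_buffer_capacity=2048,
    n_disc_updates_per_round=4,
    venv=venv,
    gen_algo=learner,
    reward_net=reward_net,
    allow_variable_horizon=True,  # flag suggested by the ValueError above
)

This is only a workaround, though; seals/CartPole-v0 is designed to be fixed-horizon, which is why the mixed lengths {548, 500} look like a bug.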
