Commit

Initial commit. First policy extraction on viper works.
Christoph Pröschel committed Jan 12, 2023
commit 15ec804 (0 parents)
Showing 18 changed files with 640 additions and 0 deletions.
162 changes: 162 additions & 0 deletions .gitignore
@@ -0,0 +1,162 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

log/*
Binary file added 1805.08328.pdf
Binary file not shown.
38 changes: 38 additions & 0 deletions README.md
@@ -0,0 +1,38 @@
# Viper

Read the accompanying blog post here (tbd).

**V**erifiability via **I**terative **P**olicy **E**xt**R**action (2018) [[paper](https://arxiv.org/abs/1805.08328)]

In this paper, the authors distill a deep reinforcement learning policy, such as a DQN, into a decision tree policy that can then be automatically checked for correctness, robustness, and stability.

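At a high level, the loop is DAgger-style: roll out the current policy, relabel the visited states with the oracle's actions, resample states by how much the oracle's Q-values say the decision matters, and fit a decision tree on the aggregated data. Below is a minimal sketch of that loop; `rollout`, `evaluate`, and `oracle.q_values` are placeholder helpers for illustration, not code from this repository.

```
import numpy as np
from sklearn.tree import DecisionTreeClassifier

def viper(oracle, env, n_iters=20, n_samples=20_000):
    dataset, trees = [], []
    policy = oracle  # the first rollouts are driven by the oracle itself
    for _ in range(n_iters):
        # 1. Roll out the current policy and label every visited state
        #    with the oracle's action (DAgger-style data aggregation).
        states = rollout(env, policy)
        qs = np.array([oracle.q_values(s) for s in states])
        actions = qs.argmax(axis=1)
        # 2. Weight states by how much the action choice matters:
        #    l(s) = V*(s) - min_a Q*(s, a).
        weights = qs.max(axis=1) - qs.min(axis=1)
        dataset += list(zip(states, actions, weights))
        # 3. Resample the aggregated dataset by weight and fit a tree.
        s, a, w = map(np.array, zip(*dataset))
        idx = np.random.choice(len(s), size=min(n_samples, len(s)), p=w / w.sum())
        tree = DecisionTreeClassifier(max_depth=8).fit(s[idx], a[idx])
        trees.append(tree)
        policy = tree
    # 4. Return the tree that scores best on evaluation rollouts.
    return max(trees, key=lambda t: evaluate(env, t))
```
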
This repository implements and tests the VIPER algorithm on the following environments:

- CartPole
- Atari Pong
- ToyPong (tbd)

## Usage

### Training the oracle

Atari Pong:

```
python main.py train-oracle --env-name PongNoFrameskip-v4 --n-env 8 --total-timesteps 10_000_000
```

Cart pole:

```
python main.py train-oracle --env-name CartPole-v1 --n-env 8 --total-timesteps 100_000
```


### Running VIPER

Once the oracle policies are trained, you can run VIPER on the same environment:

```
python main.py train-viper --env-name CartPole-v1 --n-env 1
```
34 changes: 34 additions & 0 deletions gym_env/__init__.py
@@ -0,0 +1,34 @@
import gym
import numpy as np
from gym import register
from stable_baselines3.common.env_util import make_atari_env
from stable_baselines3.common.vec_env import VecFrameStack, DummyVecEnv

register(
    id='ToyPong-v0',
    entry_point='gym_env.toy_pong:ToyPong',
    kwargs={'args': None}
)

register(
    id='WrappedPong-v0',
    entry_point='gym_env.atari_pong:AtariPong',
    kwargs={'args': None}
)


def make_env(args):
    if args.env_name == "PongNoFrameskip-v4":
        env = make_atari_env(args.env_name, n_envs=args.n_env)
        env = VecFrameStack(env, n_stack=4)
        return env
    if args.env_name == "CartPole-v1":
        return DummyVecEnv([lambda: gym.make(args.env_name) for _ in range(args.n_env)])
    return gym.make(args.env_name)


def is_done(done):
    if type(done) is np.ndarray:
        return done.all()
    else:
        return done
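
For orientation, these helpers might be exercised from a script roughly as follows; the `SimpleNamespace` args object is a stand-in for the argparse namespace that `main.py` presumably builds, and is an assumption rather than part of this commit.

```
from types import SimpleNamespace

from gym_env import make_env, is_done

# Hypothetical args; only env_name and n_env are read by make_env above.
args = SimpleNamespace(env_name="CartPole-v1", n_env=2)
env = make_env(args)

obs = env.reset()
for _ in range(200):
    # One random action per sub-environment in the vectorized env.
    actions = [env.action_space.sample() for _ in range(args.n_env)]
    obs, rewards, dones, infos = env.step(actions)
    if is_done(dones):
        break
```
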
107 changes: 107 additions & 0 deletions gym_env/atari_pong.py
@@ -0,0 +1,107 @@
import collections
import math

import numpy as np
import gym

# The default ALE environment returns only the picture of the game screen,
# however the extracted policy requires a state vector (x, y, y', y'', ball_x, ball_y)
# that is extracted from the image, but it is not clear how.
# We therefore extract everything we can from the ALE ram and estimate the missing quantities.
# See https://github.com/mila-iqia/atari-representation-learning/blob/master/atariari/benchmark/ram_annotations.py#L178
class AtariPong(gym.Env):
    def __init__(self, args):
        self.env = gym.make("Pong-v4", obs_type="ram", render_mode="human")

        self._last_states = collections.deque(maxlen=10)
        self.observation_space = gym.spaces.Box(low=-np.inf, high=np.inf, shape=(8,), dtype=np.float32)
        self.action_space = self.env.action_space

    def reset(self):
        self._last_states.clear()
        ram = self.env.reset()
        return self._ram_to_obs(ram)

    def _ram_to_obs(self, ram):
        player_paddle_y = ram[51].astype(np.float32)  # Y coordinate of your paddle
        ball_x = ram[49].astype(np.float32)  # X coordinate of ball
        ball_y = ram[54].astype(np.float32)  # Y coordinate of ball

        speed = self._estimate_speed()[-1]
        acc = self._estimate_acceleration()[-1]
        jerk = self._estimate_jerk()[-1]
        ball_speed = self._estimate_ball_velocity()[-1]

        state = [player_paddle_y, speed, acc, jerk, ball_x, ball_y, ball_speed[0], ball_speed[1]]

        return state

    def step(self, action):
        ram, reward, done, info = self.env.step(action)

        next_state = self._ram_to_obs(ram)
        self._last_states.append(next_state)

        return next_state, reward, done, info

    def render(self, mode="human"):
        self.env.render(mode="rgb_array")

    def _estimate_speed(self):
        if len(self._last_states) < 2:
            return [0]

        last = None
        speeds = []
        for state in self._last_states:
            paddle_y = state[0]
            if last is None:
                last = paddle_y
                continue

            speeds.append(paddle_y - last)
            last = paddle_y

        return speeds

    def _estimate_acceleration(self):
        speeds = self._estimate_speed()
        if len(speeds) < 2:
            return [0]

        last = None
        accs = []
        for speed in speeds:
            if last is None:
                last = speed
                continue

            accs.append(speed - last)
            last = speed

        return accs

    def _estimate_jerk(self):
        accs = self._estimate_acceleration()
        if len(accs) < 2:
            return [0]

        return [accs[-1] - accs[-2]]

    def _estimate_ball_velocity(self):
        if len(self._last_states) < 2:
            return [[0, 0]]

        last = None
        speeds = []
        for state in self._last_states:
            ball_x = state[4]
            ball_y = state[5]
            if last is None:
                last = [ball_x, ball_y]
                continue

            speeds.append([ball_x - last[0], ball_y - last[1]])
            last = [ball_x, ball_y]

        return speeds
10 changes: 10 additions & 0 deletions gym_env/toy_pong.py
@@ -0,0 +1,10 @@
import gym
import numpy as np
from gym.spaces import Box

# https://stable-baselines3.readthedocs.io/en/master/guide/rl_tips.html#tips-and-tricks-when-creating-a-custom-environment
class ToyPong(gym.Env):
    n_agent = 1

    def __init__(self, args):
        self.args = args
(Remaining changed files not shown.)
