"""
This module implements several agents. An agent is characterized by two methods:
* act : implements the policy, i.e., it returns agent's decisions to interact in a MDP or Markov Game.
* update : the learning mechanism of the agent.
"""
import numpy as np
from numpy.random import choice

from engine import RMG


class Agent():
    """
    Parent abstract Agent.
    """
    def __init__(self, action_space):
        self.action_space = action_space

    def act(self, obs):
        """
        Implements the policy pi : S -> A.
        obs is the observed state s.
        """
        raise NotImplementedError()

    def update(self, obs, actions, rewards, new_obs):
        """
        Called after an interaction has occurred, i.e., after all agents have taken
        their respective actions, observed their rewards and arrived at a new
        observation (state).
        For example, this is where a Q-learning agent would update her Q-function.
        """
        pass
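
# Illustrative sketch (not part of the original agents): a minimal concrete
# subclass of Agent, a level-0 player that acts uniformly at random and never
# learns. It only assumes, as the agents below do, that action_space is indexable.
class RandomAgent(Agent):
    """Plays uniformly at random; update is inherited as a no-op."""

    def act(self, obs=None):
        return self.action_space[choice(np.arange(len(self.action_space)))]
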
############################ KUHN POKER ########################################
class kuhnAgent2(Agent):
    """
    Stationary second agent in the Kuhn poker game. His strategy depends on his card
    and on the previous action of his opponent, and is parametrized by two
    probabilities: zeta (bet with J after a pass) and eta (bet with Q after a bet).
    Cards: J=0, Q=1, K=2.
    Actions: pass=0, bet=1.
    """

    def __init__(self, action_space, zeta, eta):
        Agent.__init__(self, action_space)
        self.zeta = zeta  # probability of betting with J after the opponent passes
        self.eta = eta    # probability of betting with Q after the opponent bets

    def act(self, card, enemy_action):
        if card == 2:  # always bet with a K
            return 1
        if card == 0:  # with a J, bluff after a pass with probability zeta
            if enemy_action == 0:
                return 1 if np.random.rand() < self.zeta else 0
            else:
                return 0  # never call a bet holding a J
        else:          # with a Q, check after a pass; call a bet with probability eta
            if enemy_action == 0:
                return 0
            else:
                return 1 if np.random.rand() < self.eta else 0
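
# Illustrative usage sketch (zeta and eta below are assumed example values, not
# taken from the original repo): enumerate the strategy of kuhnAgent2 for every
# card / opponent-action combination.
def demo_kuhn_agent2(zeta=0.3, eta=0.4):
    agent2 = kuhnAgent2(action_space=[0, 1], zeta=zeta, eta=eta)
    for card in (0, 1, 2):            # J, Q, K
        for enemy_action in (0, 1):   # opponent passed / opponent bet
            print('card=%d, enemy_action=%d -> action=%d'
                  % (card, enemy_action, agent2.act(card, enemy_action)))
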
class FPLearningAgent(Agent):
    """
    A Q-learning agent that treats the other player as a level-0 agent.
    She learns from the other agent's actions in a Bayesian way.
    She represents Q-values in a tabular fashion, i.e., using a matrix Q.
    """

    def __init__(self, action_space, enemy_action_space, n_states, learning_rate, epsilon, gamma):
        Agent.__init__(self, action_space)
        self.n_states = n_states
        self.alpha = learning_rate
        self.epsilon = epsilon
        self.gamma = gamma
        self.enemy_action_space = enemy_action_space
        # This is the Q-function Q(s, a, b)
        self.Q = np.zeros([self.n_states, len(self.action_space), len(self.enemy_action_space)])
        # Parameters of the Dirichlet distribution used to model the other agent,
        # conditioned on the state. Initialized using a uniform prior.
        self.Dir = np.ones([self.n_states, len(self.enemy_action_space)])

    def act(self, obs=None):
        """An epsilon-greedy policy"""
        if np.random.rand() < self.epsilon:
            return self.action_space[choice(np.arange(len(self.action_space)))]
        else:
            # Average Q(s, a, b) over the opponent's predicted action b (the mean of
            # the Dirichlet posterior), then act greedily on the expected Q-values.
            return self.action_space[np.argmax(np.dot(self.Q[obs[0], :, :],
                                                      self.Dir[obs[0], :]/np.sum(self.Dir[obs[0], :])))]

    def update(self, obs, actions, rewards, new_obs):
        """The vanilla Q-learning update rule"""
        a0, a1 = actions
        r0, _ = rewards
        self.Dir[obs[0], a1] += 1  # update beliefs about the adversary
        # Expected value of the new state under the current opponent model
        aux = np.max(np.dot(self.Q[new_obs[0], :, :],
                            self.Dir[new_obs[0], :]/np.sum(self.Dir[new_obs[0], :])))
        self.Q[obs[0], a0, a1] = ((1 - self.alpha)*self.Q[obs[0], a0, a1] +
                                  self.alpha*(r0 + self.gamma*aux))
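
# Illustrative sketch of the act/update cycle for FPLearningAgent. The transitions
# below are synthetic placeholders (random opponent action, random reward, random
# next state), not a real game from the engine module.
def demo_fp_learning(n_states=4, steps=100):
    agent = FPLearningAgent(action_space=[0, 1], enemy_action_space=[0, 1],
                            n_states=n_states, learning_rate=0.1,
                            epsilon=0.1, gamma=0.9)
    obs = [0]
    for _ in range(steps):
        a0 = agent.act(obs)
        a1 = choice([0, 1])                       # stand-in opponent action
        r0 = np.random.randn()                    # placeholder reward
        new_obs = [choice(np.arange(n_states))]   # placeholder next state
        agent.update(obs, (a0, a1), (r0, -r0), new_obs)
        obs = new_obs
    return agent.Q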