QLearningV2.py
import numpy as np
import random


if __name__ == '__main__':
print("Q Learning Start")
# print( np.argmax(Q[0,:]) )
# Init statte - action
reward_state_action=np.array([[0,0,0,0,1,0],
[0,0,0,1,0,100],
[0,0,0,1,0,0],
[0,1,1,0,1,0],
[1,0,0,1,0,100],
[0,1,0,0,1,100]
])
Q_value=np.array([[0,0,0,0,0,0],
[0,0,0,0,0,0],
[0,0,0,0,0,0],
[0,0,0,0,0,0],
[0,0,0,0,0,0],
[0,0,0,0,0,0]
])
    goal_state = 5
    epsilon = 0.5          # Initial exploration rate
    decay_epsilon = 0.95   # Multiplicative epsilon decay applied once per episode
    gamma = 0.75           # Discount factor
    alpha = 0.95           # Learning rate
    numState = 6
    numAction = 6
    R = reward_state_action
    # Outer loop: one training episode per iteration
    for n in range(1, 1000):
        init_state = random.randint(0, numState - 1)
        state_current = init_state
        epsilon *= decay_epsilon

        # Inner loop: take steps until the goal state is reached
        while True:
            # Iterate through the reward matrix and collect the actions with reward > 0
            playable_actions = []
            for j in range(numAction):
                if reward_state_action[state_current, j] > 0:
                    playable_actions.append(j)

            # Epsilon-greedy selection: explore a random valid action, otherwise
            # exploit the action with the highest Q value for this state
            if np.random.random() < epsilon or np.sum(Q_value[state_current, :]) == 0:
                action = np.random.choice(playable_actions)
            else:
                action = np.argmax(Q_value[state_current, :])

            # Deterministic model: taking action j moves the agent to state j
            state_next = action

            # Q-Learning: temporal-difference error, then update the Q value table
            TD = R[state_current, action] + gamma * np.max(Q_value[state_next, :]) - Q_value[state_current, action]
            Q_value[state_current, action] = Q_value[state_current, action] + alpha * TD

            state_current = state_next
            if state_current == goal_state:
                break
print("Q Learning End")
print(Q_value)
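
    # Not in the original script: a minimal sketch of how the learned Q table could be
    # used, assuming the same deterministic "action j moves to state j" transition model.
    # It follows the greedy (argmax) action from a start state until the goal is reached.
    start_state = 2  # hypothetical start state chosen only for illustration
    path = [start_state]
    state = start_state
    while state != goal_state and len(path) < numState:
        state = int(np.argmax(Q_value[state, :]))  # greedy action = next state
        path.append(state)
    print("Greedy path from state %d: %s" % (start_state, path))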