import tensorflow as tf
import tensorflow.contrib.layers as layers
import numpy as np

# Parameters for training
GRAD_CLIP = 1000.0
KEEP_PROB1 = 1  # was 0.5
KEEP_PROB2 = 1  # was 0.7
RNN_SIZE = 512
GOAL_REPR_SIZE = 12

# Used to initialize the weights of the policy and value output layers:
# each column of the weight matrix is a random Gaussian vector rescaled to
# have L2 norm equal to `std`.
def normalized_columns_initializer(std=1.0):
    def _initializer(shape, dtype=None, partition_info=None):
        out = np.random.randn(*shape).astype(np.float32)
        out *= std / np.sqrt(np.square(out).sum(axis=0, keepdims=True))
        return tf.constant(out)
    return _initializer

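# A quick NumPy check of the column normalization above (illustrative only;
# the 512x5 shape and std=0.01 are arbitrary values, not taken from this file):
#   w = np.random.randn(512, 5).astype(np.float32)
#   w *= 0.01 / np.sqrt(np.square(w).sum(axis=0, keepdims=True))
#   np.linalg.norm(w, axis=0)  # -> array of five values, each ~0.01
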
class ACNet:
    def __init__(self, scope, a_size, trainer, TRAINING, GRID_SIZE, GLOBAL_NET_SCOPE):
        with tf.variable_scope(str(scope) + '/qvalues'):
            # The input size may require more work to fit the interface.
            # Observations arrive channels-first and are transposed to NHWC.
            self.inputs = tf.placeholder(shape=[None, 4, GRID_SIZE, GRID_SIZE], dtype=tf.float32)
            self.goal_pos = tf.placeholder(shape=[None, 3], dtype=tf.float32)
            self.myinput = tf.transpose(self.inputs, perm=[0, 2, 3, 1])

            # Message input consumed by the communication layer in _build_net.
            self.message = tf.placeholder(shape=[None, RNN_SIZE], dtype=tf.float32)

            self.policy, self.value, self.state_out, self.state_in, self.state_init, \
                self.blocking, self.on_goal, self.valids, self.priority = \
                self._build_net(self.myinput, self.goal_pos, self.message, RNN_SIZE, TRAINING, a_size)
        if TRAINING:
            # Placeholders for the RL targets and for the imitation (expert) actions.
            self.actions = tf.placeholder(shape=[None], dtype=tf.int32)
            self.actions_onehot = tf.one_hot(self.actions, a_size, dtype=tf.float32)
            self.train_valid = tf.placeholder(shape=[None, a_size], dtype=tf.float32)
            self.target_v = tf.placeholder(tf.float32, [None], 'Vtarget')
            self.advantages = tf.placeholder(shape=[None], dtype=tf.float32)
            self.target_blockings = tf.placeholder(tf.float32, [None])
            self.target_on_goals = tf.placeholder(tf.float32, [None])
            self.responsible_outputs = tf.reduce_sum(self.policy * self.actions_onehot, [1])
            self.train_value = tf.placeholder(tf.float32, [None])
            self.optimal_actions = tf.placeholder(tf.int32, [None])
            self.optimal_actions_onehot = tf.one_hot(self.optimal_actions, a_size, dtype=tf.float32)
            self.target_priority = tf.placeholder(tf.float32, [None])

            # Loss functions
            self.value_loss = tf.reduce_sum(self.train_value * tf.square(self.target_v - tf.reshape(self.value, shape=[-1])))
            self.entropy = - tf.reduce_sum(self.policy * tf.log(tf.clip_by_value(self.policy, 1e-10, 1.0)))
            self.policy_loss = - tf.reduce_sum(tf.log(tf.clip_by_value(self.responsible_outputs, 1e-15, 1.0)) * self.advantages)
            self.valid_loss = - tf.reduce_sum(tf.log(tf.clip_by_value(self.valids, 1e-10, 1.0)) * self.train_valid
                                              + tf.log(tf.clip_by_value(1 - self.valids, 1e-10, 1.0)) * (1 - self.train_valid))
            self.blocking_loss = - tf.reduce_sum(self.target_blockings * tf.log(tf.clip_by_value(self.blocking, 1e-10, 1.0))
                                                 + (1 - self.target_blockings) * tf.log(tf.clip_by_value(1 - self.blocking, 1e-10, 1.0)))
            # (on_goal_loss is computed below but not currently included in self.loss.)
            self.on_goal_loss = - tf.reduce_sum(self.target_on_goals * tf.log(tf.clip_by_value(self.on_goal, 1e-10, 1.0))
                                                + (1 - self.target_on_goals) * tf.log(tf.clip_by_value(1 - self.on_goal, 1e-10, 1.0)))
            self.loss = 0.5 * self.value_loss + self.policy_loss + 0.5 * self.valid_loss \
                        - self.entropy * 0.01 + 0.5 * self.blocking_loss

            self.priority_loss = - tf.reduce_mean(self.target_priority * tf.log(tf.clip_by_value(self.priority, 1e-10, 1.0))
                                                  + (1 - self.target_priority) * tf.log(tf.clip_by_value(1 - self.priority, 1e-10, 1.0)))
            # self.imitation_loss = tf.reduce_mean(tf.contrib.keras.backend.categorical_crossentropy(self.optimal_actions_onehot, self.policy))
            self.behavior_cloning_loss = tf.reduce_mean(tf.contrib.keras.backend.categorical_crossentropy(self.optimal_actions_onehot, self.policy))
            self.imitation_loss = self.behavior_cloning_loss + 0.5 * self.priority_loss

            # Get gradients from the local network using the local losses and
            # normalize the gradients using clipping.
            local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope + '/qvalues')
            self.gradients = tf.gradients(self.loss, local_vars)
            self.var_norms = tf.global_norm(local_vars)
            grads, self.grad_norms = tf.clip_by_global_norm(self.gradients, GRAD_CLIP)

            # Apply local gradients to the global network.
            global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, GLOBAL_NET_SCOPE + '/qvalues')
            self.apply_grads = trainer.apply_gradients(zip(grads, global_vars))

            # Now the gradients for the imitation loss.
            self.i_gradients = tf.gradients(self.imitation_loss, local_vars)
            self.i_var_norms = tf.global_norm(local_vars)
            i_grads, self.i_grad_norms = tf.clip_by_global_norm(self.i_gradients, GRAD_CLIP)

            # Apply local imitation gradients to the global network.
            self.apply_imitation_grads = trainer.apply_gradients(zip(i_grads, global_vars))
        print("Hello World... From " + str(scope))  # :)

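    # Sketch of how a worker would run one RL update (illustrative only; `net`,
    # `sess` and the `batch_*` / `c` / `h` arrays are hypothetical names that
    # are not defined in this file):
    #   feed = {net.inputs: batch_obs, net.goal_pos: batch_goals,
    #           net.message: batch_msgs, net.state_in[0]: c, net.state_in[1]: h,
    #           net.actions: batch_actions, net.target_v: batch_returns,
    #           net.advantages: batch_advantages, net.train_valid: batch_valids,
    #           net.target_blockings: batch_blockings, net.train_value: batch_value_mask}
    #   _, v_l, p_l = sess.run([net.apply_grads, net.value_loss, net.policy_loss], feed)
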
    def _build_net(self, inputs, goal_pos, message, RNN_SIZE, TRAINING, a_size):
        w_init = layers.variance_scaling_initializer()

        # Two stacks of 3x3 convolutions, each followed by max-pooling, then a
        # final VALID convolution that collapses the remaining spatial extent.
        conv1 = layers.conv2d(inputs=inputs, padding="SAME", num_outputs=RNN_SIZE // 4, kernel_size=[3, 3], stride=1, data_format="NHWC", weights_initializer=w_init, activation_fn=tf.nn.relu)
        conv1a = layers.conv2d(inputs=conv1, padding="SAME", num_outputs=RNN_SIZE // 4, kernel_size=[3, 3], stride=1, data_format="NHWC", weights_initializer=w_init, activation_fn=tf.nn.relu)
        conv1b = layers.conv2d(inputs=conv1a, padding="SAME", num_outputs=RNN_SIZE // 4, kernel_size=[3, 3], stride=1, data_format="NHWC", weights_initializer=w_init, activation_fn=tf.nn.relu)
        pool1 = layers.max_pool2d(inputs=conv1b, kernel_size=[2, 2])
        conv2 = layers.conv2d(inputs=pool1, padding="SAME", num_outputs=RNN_SIZE // 2, kernel_size=[3, 3], stride=1, data_format="NHWC", weights_initializer=w_init, activation_fn=tf.nn.relu)
        conv2a = layers.conv2d(inputs=conv2, padding="SAME", num_outputs=RNN_SIZE // 2, kernel_size=[3, 3], stride=1, data_format="NHWC", weights_initializer=w_init, activation_fn=tf.nn.relu)
        conv2b = layers.conv2d(inputs=conv2a, padding="SAME", num_outputs=RNN_SIZE // 2, kernel_size=[3, 3], stride=1, data_format="NHWC", weights_initializer=w_init, activation_fn=tf.nn.relu)
        pool2 = layers.max_pool2d(inputs=conv2b, kernel_size=[2, 2])
        conv3 = layers.conv2d(inputs=pool2, padding="VALID", num_outputs=RNN_SIZE - GOAL_REPR_SIZE, kernel_size=[2, 2], stride=1, data_format="NHWC", weights_initializer=w_init, activation_fn=None)

        flat = tf.nn.relu(layers.flatten(conv3))
        goal_layer = layers.fully_connected(inputs=goal_pos, num_outputs=GOAL_REPR_SIZE)
        hidden_input = tf.concat([flat, goal_layer], 1)
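        # Shape check (illustrative, assuming GRID_SIZE = 10): 10x10 -> pool -> 5x5
        # -> pool -> 2x2 -> 2x2 VALID conv -> 1x1 spatial with RNN_SIZE - GOAL_REPR_SIZE
        # channels, so `flat` has 512 - 12 = 500 units and concatenating the 12-unit
        # goal representation gives an RNN_SIZE-wide (512) hidden input.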
        h1 = layers.fully_connected(inputs=hidden_input, num_outputs=RNN_SIZE)
        d1 = layers.dropout(h1, keep_prob=KEEP_PROB1, is_training=TRAINING)
        h2 = layers.fully_connected(inputs=d1, num_outputs=RNN_SIZE, activation_fn=None)
        d2 = layers.dropout(h2, keep_prob=KEEP_PROB2, is_training=TRAINING)
        # Residual connection around the two fully connected layers.
        self.h3 = tf.nn.relu(d2 + hidden_input)
        # Recurrent network for temporal dependencies
        lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(RNN_SIZE, state_is_tuple=True)
        c_init = np.zeros((1, lstm_cell.state_size.c), np.float32)
        h_init = np.zeros((1, lstm_cell.state_size.h), np.float32)
        state_init = [c_init, h_init]
        c_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.c])
        h_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.h])
        rnn_in = tf.expand_dims(self.h3, [0])
        step_size = tf.shape(inputs)[:1]
        state_in = tf.nn.rnn_cell.LSTMStateTuple(c_in, h_in)
        lstm_outputs, lstm_state = tf.nn.dynamic_rnn(
            lstm_cell, rnn_in, initial_state=state_in, sequence_length=step_size,
            time_major=False)
        lstm_c, lstm_h = lstm_state
        state_out = (lstm_c[:1, :], lstm_h[:1, :])
        self.rnn_out = tf.reshape(lstm_outputs, [-1, RNN_SIZE])

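        # The batch of timesteps is unrolled as a single sequence (rnn_in has
        # shape [1, batch, RNN_SIZE]); callers would typically feed `state_init`
        # at the start of an episode and the returned `state_out` on later steps
        # so the LSTM state carries across calls.
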
        # Project the (sigmoid-squashed) incoming message and concatenate it
        # with the LSTM output before the policy head.
        message_sig = tf.sigmoid(message)
        comm_layer = layers.fully_connected(inputs=message_sig, num_outputs=RNN_SIZE, weights_initializer=normalized_columns_initializer(1. / float(RNN_SIZE)), biases_initializer=None, activation_fn=None)
        # comm_sig = tf.sigmoid(comm_layer)
        comm_rnn_concat = tf.concat([comm_layer, self.rnn_out], 1)
        # comm_rnn_concat = tf.nn.relu(comm_layer + self.rnn_out)

        # Output heads: action policy (softmax), per-action validity (sigmoid of
        # the same logits), state value, and blocking / on-goal / priority predictions.
        policy_layer = layers.fully_connected(inputs=comm_rnn_concat, num_outputs=a_size, weights_initializer=normalized_columns_initializer(1. / float(a_size)), biases_initializer=None, activation_fn=None)
        # dp = layers.dropout(policy_layer, keep_prob=KEEP_PROB1, is_training=TRAINING)
        policy = tf.nn.softmax(policy_layer)
        policy_sig = tf.sigmoid(policy_layer)
        value = layers.fully_connected(inputs=self.rnn_out, num_outputs=1, weights_initializer=normalized_columns_initializer(1.0), biases_initializer=None, activation_fn=None)
        blocking = layers.fully_connected(inputs=self.rnn_out, num_outputs=1, weights_initializer=normalized_columns_initializer(1.0), biases_initializer=None, activation_fn=tf.sigmoid)
        on_goal = layers.fully_connected(inputs=self.rnn_out, num_outputs=1, weights_initializer=normalized_columns_initializer(1.0), biases_initializer=None, activation_fn=tf.sigmoid)
        priority = layers.fully_connected(inputs=self.rnn_out, num_outputs=1, weights_initializer=normalized_columns_initializer(1.0), biases_initializer=None, activation_fn=tf.sigmoid)

        return policy, value, state_out, state_in, state_init, blocking, on_goal, policy_sig, priority
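

# ----------------------------------------------------------------------------
# Minimal construction sketch (illustrative only: the scope names, a_size,
# grid size, and learning rate below are assumptions, not values defined in
# this file). As in A3C-style setups, the shared/global network is built first
# so that a worker network can apply its gradients to the global variables.
# ----------------------------------------------------------------------------
if __name__ == '__main__':
    GLOBAL_SCOPE = 'global'
    trainer = tf.train.AdamOptimizer(learning_rate=2e-5)

    # Global (parameter-server) copy: no loss or gradient ops are created for it.
    master_network = ACNet(GLOBAL_SCOPE, a_size=5, trainer=None, TRAINING=False,
                           GRID_SIZE=10, GLOBAL_NET_SCOPE=GLOBAL_SCOPE)
    # One worker copy with the full set of losses and gradient ops.
    worker_network = ACNet('worker_0', a_size=5, trainer=trainer, TRAINING=True,
                           GRID_SIZE=10, GLOBAL_NET_SCOPE=GLOBAL_SCOPE)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # Forward pass on a dummy observation to check that the graph runs.
        obs = np.zeros((1, 4, 10, 10), dtype=np.float32)
        goal = np.zeros((1, 3), dtype=np.float32)
        msg = np.zeros((1, RNN_SIZE), dtype=np.float32)
        c, h = worker_network.state_init
        pi, v = sess.run([worker_network.policy, worker_network.value],
                         feed_dict={worker_network.inputs: obs,
                                    worker_network.goal_pos: goal,
                                    worker_network.message: msg,
                                    worker_network.state_in[0]: c,
                                    worker_network.state_in[1]: h})
        print(pi.shape, v.shape)  # expected: (1, 5) (1, 1)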