 from unityagents import UnityEnvironmentException


-def create_agent_model(env, lr=1e-4, h_size=128, epsilon=0.2, beta=1e-3, max_step=5e6):
+def create_agent_model(env, lr=1e-4, h_size=128, epsilon=0.2, beta=1e-3, max_step=5e6, normalize=False, num_layers=2):
     """
     Takes a Unity environment and model-specific hyper-parameters and returns the
     appropriate PPO agent model for the environment.
@@ -17,12 +17,14 @@ def create_agent_model(env, lr=1e-4, h_size=128, epsilon=0.2, beta=1e-3, max_ste
     :return: a sub-class of PPOAgent tailored to the environment.
     :param max_step: Total number of training steps.
     """
+    if num_layers < 1: num_layers = 1
+
     brain_name = env.brain_names[0]
     brain = env.brains[brain_name]
     if brain.action_space_type == "continuous":
-        return ContinuousControlModel(lr, brain, h_size, epsilon, max_step)
+        return ContinuousControlModel(lr, brain, h_size, epsilon, max_step, normalize, num_layers)
     if brain.action_space_type == "discrete":
-        return DiscreteControlModel(lr, brain, h_size, epsilon, beta, max_step)
+        return DiscreteControlModel(lr, brain, h_size, epsilon, beta, max_step, normalize, num_layers)


 def save_model(sess, saver, model_path="./", steps=0):
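
A minimal usage sketch of the extended factory. The import path and the "3DBall" file name below are placeholders for illustration, not part of this change; only the `normalize` and `num_layers` keyword arguments are new.

    from unityagents import UnityEnvironment
    from ppo.models import create_agent_model  # assumed module path

    env = UnityEnvironment(file_name="3DBall")  # placeholder environment binary
    # normalize=True enables running mean/variance whitening of state inputs;
    # num_layers sets the number of dense layers in every encoder.
    model = create_agent_model(env, lr=1e-4, h_size=128, normalize=True, num_layers=3)
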
@@ -57,6 +59,9 @@ def export_graph(model_path, env_name="env", target_nodes="action,value_estimate


 class PPOModel(object):
+    def __init__(self):
+        self.normalize = False
+
     def create_global_steps(self):
         """Creates TF ops to track and increment global training step."""
         self.global_step = tf.Variable(0, name="global_step", trainable=False, dtype=tf.int32)
@@ -68,7 +73,7 @@ def create_reward_encoder(self):
         self.new_reward = tf.placeholder(shape=[], dtype=tf.float32, name='new_reward')
         self.update_reward = tf.assign(self.last_reward, self.new_reward)

-    def create_visual_encoder(self, o_size_h, o_size_w, bw, h_size, num_streams, activation):
+    def create_visual_encoder(self, o_size_h, o_size_w, bw, h_size, num_streams, activation, num_layers):
         """
         Builds a set of visual (CNN) encoders.
         :param o_size_h: Height observation size.
@@ -92,11 +97,13 @@ def create_visual_encoder(self, o_size_h, o_size_w, bw, h_size, num_streams, act
                                          use_bias=False, activation=activation)
             self.conv2 = tf.layers.conv2d(self.conv1, 32, kernel_size=[4, 4], strides=[2, 2],
                                           use_bias=False, activation=activation)
-            hidden = tf.layers.dense(c_layers.flatten(self.conv2), h_size, use_bias=False, activation=activation)
+            hidden = c_layers.flatten(self.conv2)
+            for j in range(num_layers):
+                hidden = tf.layers.dense(hidden, h_size, use_bias=False, activation=activation)
             streams.append(hidden)
         return streams

-    def create_continuous_state_encoder(self, s_size, h_size, num_streams, activation):
+    def create_continuous_state_encoder(self, s_size, h_size, num_streams, activation, num_layers):
         """
         Builds a set of hidden state encoders.
         :param s_size: state input size.
@@ -107,27 +114,30 @@ def create_continuous_state_encoder(self, s_size, h_size, num_streams, activatio
         """
         self.state_in = tf.placeholder(shape=[None, s_size], dtype=tf.float32, name='state')

-        self.running_mean = tf.get_variable("running_mean", [s_size], trainable=False, dtype=tf.float32,
-                                            initializer=tf.zeros_initializer())
-        self.running_variance = tf.get_variable("running_variance", [s_size], trainable=False, dtype=tf.float32,
-                                                initializer=tf.ones_initializer())
-
-        self.normalized_state = tf.clip_by_value((self.state_in - self.running_mean) / tf.sqrt(
-            self.running_variance / (tf.cast(self.global_step, tf.float32) + 1)), -5, 5, name="normalized_state")
+        if self.normalize:
+            self.running_mean = tf.get_variable("running_mean", [s_size], trainable=False, dtype=tf.float32,
+                                                initializer=tf.zeros_initializer())
+            self.running_variance = tf.get_variable("running_variance", [s_size], trainable=False, dtype=tf.float32,
+                                                    initializer=tf.ones_initializer())

-        self.new_mean = tf.placeholder(shape=[s_size], dtype=tf.float32, name='new_mean')
-        self.new_variance = tf.placeholder(shape=[s_size], dtype=tf.float32, name='new_variance')
-        self.update_mean = tf.assign(self.running_mean, self.new_mean)
-        self.update_variance = tf.assign(self.running_variance, self.new_variance)
+            self.normalized_state = tf.clip_by_value((self.state_in - self.running_mean) / tf.sqrt(
+                self.running_variance / (tf.cast(self.global_step, tf.float32) + 1)), -5, 5, name="normalized_state")

+            self.new_mean = tf.placeholder(shape=[s_size], dtype=tf.float32, name='new_mean')
+            self.new_variance = tf.placeholder(shape=[s_size], dtype=tf.float32, name='new_variance')
+            self.update_mean = tf.assign(self.running_mean, self.new_mean)
+            self.update_variance = tf.assign(self.running_variance, self.new_variance)
+        else:
+            self.normalized_state = self.state_in
         streams = []
         for i in range(num_streams):
-            hidden_1 = tf.layers.dense(self.normalized_state, h_size, use_bias=False, activation=activation)
-            hidden_2 = tf.layers.dense(hidden_1, h_size, use_bias=False, activation=activation)
-            streams.append(hidden_2)
+            hidden = self.normalized_state
+            for j in range(num_layers):
+                hidden = tf.layers.dense(hidden, h_size, use_bias=False, activation=activation)
+            streams.append(hidden)
         return streams

-    def create_discrete_state_encoder(self, s_size, h_size, num_streams, activation):
+    def create_discrete_state_encoder(self, s_size, h_size, num_streams, activation, num_layers):
         """
         Builds a set of hidden state encoders from discrete state input.
         :param s_size: state input size (discrete).
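
When `normalize` is on, the graph only stores the running statistics; the values fed to the `new_mean` and `new_variance` placeholders have to be computed by the trainer outside the graph. A rough sketch of such an incremental update, consistent with the `running_variance / (global_step + 1)` scaling above, is shown here; this helper is hypothetical and not part of the diff.

    import numpy as np

    def updated_running_stats(mean, variance, steps, states):
        # 'variance' is an accumulated sum of squared deviations; the graph
        # divides it by (global_step + 1) before taking the square root.
        batch_mean = np.mean(states, axis=0)
        new_mean = mean + (batch_mean - mean) / (steps + 1)
        new_variance = variance + (batch_mean - new_mean) * (batch_mean - mean)
        return new_mean, new_variance  # fed via update_mean / update_variance
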
@@ -140,8 +150,10 @@ def create_discrete_state_encoder(self, s_size, h_size, num_streams, activation)
         state_in = tf.reshape(self.state_in, [-1])
         state_onehot = c_layers.one_hot_encoding(state_in, s_size)
         streams = []
         for i in range(num_streams):
-            hidden = tf.layers.dense(state_onehot, h_size, use_bias=False, activation=activation)
+            hidden = state_onehot
+            for j in range(num_layers):
+                hidden = tf.layers.dense(hidden, h_size, use_bias=False, activation=activation)
             streams.append(hidden)
         return streams

@@ -186,29 +198,31 @@ def create_ppo_optimizer(self, probs, old_probs, value, entropy, beta, epsilon,


 class ContinuousControlModel(PPOModel):
-    def __init__(self, lr, brain, h_size, epsilon, max_step):
+    def __init__(self, lr, brain, h_size, epsilon, max_step, normalize, num_layers):
         """
         Creates Continuous Control Actor-Critic model.
         :param brain: State-space size
         :param h_size: Hidden layer size
         """
+        super().__init__()
         s_size = brain.state_space_size
         a_size = brain.action_space_size

+        self.normalize = normalize
         self.create_global_steps()
         self.create_reward_encoder()

         hidden_state, hidden_visual, hidden_policy, hidden_value = None, None, None, None
         if brain.number_observations > 0:
             height_size, width_size = brain.camera_resolutions[0]['height'], brain.camera_resolutions[0]['width']
             bw = brain.camera_resolutions[0]['blackAndWhite']
-            hidden_visual = self.create_visual_encoder(height_size, width_size, bw, h_size, 2, tf.nn.tanh)
+            hidden_visual = self.create_visual_encoder(height_size, width_size, bw, h_size, 2, tf.nn.tanh, num_layers)
         if brain.state_space_size > 0:
             s_size = brain.state_space_size
             if brain.state_space_type == "continuous":
-                hidden_state = self.create_continuous_state_encoder(s_size, h_size, 2, tf.nn.tanh)
+                hidden_state = self.create_continuous_state_encoder(s_size, h_size, 2, tf.nn.tanh, num_layers)
             else:
-                hidden_state = self.create_discrete_state_encoder(s_size, h_size, 2, tf.nn.tanh)
+                hidden_state = self.create_discrete_state_encoder(s_size, h_size, 2, tf.nn.tanh, num_layers)

         if hidden_visual is None and hidden_state is None:
             raise Exception("No valid network configuration possible. "
@@ -249,26 +263,28 @@ def __init__(self, lr, brain, h_size, epsilon, max_step):


 class DiscreteControlModel(PPOModel):
-    def __init__(self, lr, brain, h_size, epsilon, beta, max_step):
+    def __init__(self, lr, brain, h_size, epsilon, beta, max_step, normalize, num_layers):
         """
         Creates Discrete Control Actor-Critic model.
         :param brain: State-space size
         :param h_size: Hidden layer size
         """
+        super().__init__()
         self.create_global_steps()
         self.create_reward_encoder()
+        self.normalize = normalize

         hidden_state, hidden_visual, hidden = None, None, None
         if brain.number_observations > 0:
             height_size, width_size = brain.camera_resolutions[0]['height'], brain.camera_resolutions[0]['width']
             bw = brain.camera_resolutions[0]['blackAndWhite']
-            hidden_visual = self.create_visual_encoder(height_size, width_size, bw, h_size, 1, tf.nn.elu)[0]
+            hidden_visual = self.create_visual_encoder(height_size, width_size, bw, h_size, 1, tf.nn.elu, num_layers)[0]
         if brain.state_space_size > 0:
             s_size = brain.state_space_size
             if brain.state_space_type == "continuous":
-                hidden_state = self.create_continuous_state_encoder(s_size, h_size, 1, tf.nn.elu)[0]
+                hidden_state = self.create_continuous_state_encoder(s_size, h_size, 1, tf.nn.elu, num_layers)[0]
             else:
-                hidden_state = self.create_discrete_state_encoder(s_size, h_size, 1, tf.nn.elu)[0]
+                hidden_state = self.create_discrete_state_encoder(s_size, h_size, 1, tf.nn.elu, num_layers)[0]

         if hidden_visual is None and hidden_state is None:
             raise Exception("No valid network configuration possible. "
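
With the default `num_layers=2`, the state-encoder loop builds the same two-dense-layer stack as the removed explicit `hidden_1`/`hidden_2` code, so state-input behaviour is unchanged; only the visual encoder gains an extra dense layer by default. A small illustrative expansion, not part of the diff (the input shape and sizes are placeholders):

    import tensorflow as tf

    h_size, activation = 128, tf.nn.tanh
    normalized_state = tf.placeholder(tf.float32, [None, 8])  # placeholder state input

    # num_layers = 2 unrolled...
    hidden = normalized_state
    for j in range(2):
        hidden = tf.layers.dense(hidden, h_size, use_bias=False, activation=activation)

    # ...builds the same two-layer stack as the removed explicit version:
    hidden_1 = tf.layers.dense(normalized_state, h_size, use_bias=False, activation=activation)
    hidden_2 = tf.layers.dense(hidden_1, h_size, use_bias=False, activation=activation)
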