Deep Reinforcement Learning: DPPO
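This post collects two implementations from Morvan Zhou's reinforcement-learning tutorials (https://morvanzhou.github.io/tutorials): first a single-threaded Proximal Policy Optimization (PPO) agent for gym's Pendulum-v0, then its distributed variant (DPPO), in which parallel worker threads collect rollouts and a global PPO thread trains on the pooled batch, following the updating rule from DeepMind's "Emergence of Locomotion Behaviours in Rich Environments" (https://arxiv.org/abs/1707.02286).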
1 """ 2 A simple version of Proximal Policy Optimization (PPO) using single thread. 3 4 Based on: 5 1. Emergence of Locomotion Behaviours in Rich Environments (Google Deepmind): [https://arxiv.org/abs/1707.02286] 6 2. Proximal Policy Optimization Algorithms (OpenAI): [https://arxiv.org/abs/1707.06347] 7 8 View more on my tutorial website: https://morvanzhou.github.io/tutorials 9 10 Dependencies: 11 tensorflow r1.2 12 gym 0.9.2 13 """ 14 15 import tensorflow as tf 16 import numpy as np 17 import matplotlib.pyplot as plt 18 import gym 19 20 EP_MAX = 1000 21 EP_LEN = 200 22 GAMMA = 0.9 23 A_LR = 0.0001 24 C_LR = 0.0002 25 BATCH = 32 26 A_UPDATE_STEPS = 10 27 C_UPDATE_STEPS = 10 28 S_DIM, A_DIM = 3, 1 29 METHOD = [ 30 dict(name='kl_pen', kl_target=0.01, lam=0.5), # KL penalty 31 dict(name='clip', epsilon=0.2), # Clipped surrogate objective, find this is better 32 ][1] # choose the method for optimization 33 34 35 class PPO(object): 36 37 def __init__(self): 38 self.sess = tf.Session() 39 self.tfs = tf.placeholder(tf.float32, [None, S_DIM], 'state') 40 41 # critic 42 with tf.variable_scope('critic'): 43 l1 = tf.layers.dense(self.tfs, 100, tf.nn.relu) 44 self.v = tf.layers.dense(l1, 1) 45 self.tfdc_r = tf.placeholder(tf.float32, [None, 1], 'discounted_r') 46 self.advantage = self.tfdc_r - self.v 47 self.closs = tf.reduce_mean(tf.square(self.advantage)) 48 self.ctrain_op = tf.train.AdamOptimizer(C_LR).minimize(self.closs) 49 50 # actor 51 pi, pi_params = self._build_anet('pi', trainable=True) 52 oldpi, oldpi_params = self._build_anet('oldpi', trainable=False) 53 with tf.variable_scope('sample_action'): 54 self.sample_op = tf.squeeze(pi.sample(1), axis=0) # choosing action 55 with tf.variable_scope('update_oldpi'): 56 self.update_oldpi_op = [oldp.assign(p) for p, oldp in zip(pi_params, oldpi_params)] 57 58 self.tfa = tf.placeholder(tf.float32, [None, A_DIM], 'action') 59 self.tfadv = tf.placeholder(tf.float32, [None, 1], 'advantage') 60 with tf.variable_scope('loss'): 61 with tf.variable_scope('surrogate'): 62 # ratio = tf.exp(pi.log_prob(self.tfa) - oldpi.log_prob(self.tfa)) 63 ratio = pi.prob(self.tfa) / oldpi.prob(self.tfa) 64 surr = ratio * self.tfadv 65 if METHOD['name'] == 'kl_pen': 66 self.tflam = tf.placeholder(tf.float32, None, 'lambda') 67 kl = tf.distributions.kl_divergence(oldpi, pi) 68 self.kl_mean = tf.reduce_mean(kl) 69 self.aloss = -(tf.reduce_mean(surr - self.tflam * kl)) 70 else: # clipping method, find this is better 71 self.aloss = -tf.reduce_mean(tf.minimum( 72 surr, 73 tf.clip_by_value(ratio, 1.-METHOD['epsilon'], 1.+METHOD['epsilon'])*self.tfadv)) 74 75 with tf.variable_scope('atrain'): 76 self.atrain_op = tf.train.AdamOptimizer(A_LR).minimize(self.aloss) 77 78 tf.summary.FileWriter("log/", self.sess.graph) 79 80 self.sess.run(tf.global_variables_initializer()) 81 82 def update(self, s, a, r): 83 self.sess.run(self.update_oldpi_op) 84 adv = self.sess.run(self.advantage, {self.tfs: s, self.tfdc_r: r}) 85 # adv = (adv - adv.mean())/(adv.std()+1e-6) # sometimes helpful 86 87 # update actor 88 if METHOD['name'] == 'kl_pen': 89 for _ in range(A_UPDATE_STEPS): 90 _, kl = self.sess.run( 91 [self.atrain_op, self.kl_mean], 92 {self.tfs: s, self.tfa: a, self.tfadv: adv, self.tflam: METHOD['lam']}) 93 if kl > 4*METHOD['kl_target']: # this in in google's paper 94 break 95 if kl < METHOD['kl_target'] / 1.5: # adaptive lambda, this is in OpenAI's paper 96 METHOD['lam'] /= 2 97 elif kl > METHOD['kl_target'] * 1.5: 98 METHOD['lam'] *= 2 99 METHOD['lam'] = np.clip(METHOD['lam'], 1e-4, 10) # 
sometimes explode, this clipping is my solution 100 else: # clipping method, find this is better (OpenAI's paper) 101 [self.sess.run(self.atrain_op, {self.tfs: s, self.tfa: a, self.tfadv: adv}) for _ in range(A_UPDATE_STEPS)] 102 103 # update critic 104 [self.sess.run(self.ctrain_op, {self.tfs: s, self.tfdc_r: r}) for _ in range(C_UPDATE_STEPS)] 105 106 def _build_anet(self, name, trainable): 107 with tf.variable_scope(name): 108 l1 = tf.layers.dense(self.tfs, 100, tf.nn.relu, trainable=trainable) 109 mu = 2 * tf.layers.dense(l1, A_DIM, tf.nn.tanh, trainable=trainable) 110 sigma = tf.layers.dense(l1, A_DIM, tf.nn.softplus, trainable=trainable) 111 norm_dist = tf.distributions.Normal(loc=mu, scale=sigma) 112 params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=name) 113 return norm_dist, params 114 115 def choose_action(self, s): 116 s = s[np.newaxis, :] 117 a = self.sess.run(self.sample_op, {self.tfs: s})[0] 118 return np.clip(a, -2, 2) 119 120 def get_v(self, s): 121 if s.ndim < 2: s = s[np.newaxis, :] 122 return self.sess.run(self.v, {self.tfs: s})[0, 0] 123 124 env = gym.make('Pendulum-v0').unwrapped 125 ppo = PPO() 126 all_ep_r = [] 127 128 for ep in range(EP_MAX): 129 s = env.reset() 130 buffer_s, buffer_a, buffer_r = [], [], [] 131 ep_r = 0 132 for t in range(EP_LEN): # in one episode 133 env.render() 134 a = ppo.choose_action(s) 135 s_, r, done, _ = env.step(a) 136 buffer_s.append(s) 137 buffer_a.append(a) 138 buffer_r.append((r+8)/8) # normalize reward, find to be useful 139 s = s_ 140 ep_r += r 141 142 # update ppo 143 if (t+1) % BATCH == 0 or t == EP_LEN-1: 144 v_s_ = ppo.get_v(s_) 145 discounted_r = [] 146 for r in buffer_r[::-1]: 147 v_s_ = r + GAMMA * v_s_ 148 discounted_r.append(v_s_) 149 discounted_r.reverse() 150 151 bs, ba, br = np.vstack(buffer_s), np.vstack(buffer_a), np.array(discounted_r)[:, np.newaxis] 152 buffer_s, buffer_a, buffer_r = [], [], [] 153 ppo.update(bs, ba, br) 154 if ep == 0: all_ep_r.append(ep_r) 155 else: all_ep_r.append(all_ep_r[-1]*0.9 + ep_r*0.1) 156 print( 157 'Ep: %i' % ep, 158 "|Ep_r: %i" % ep_r, 159 ("|Lam: %.4f" % METHOD['lam']) if METHOD['name'] == 'kl_pen' else '', 160 ) 161 162 plt.plot(np.arange(len(all_ep_r)), all_ep_r) 163 plt.xlabel('Episode');plt.ylabel('Moving averaged episode reward');plt.show()
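For reference, the two actor objectives that the listing switches between via `METHOD` are, in the notation of the papers it cites, the KL-penalized objective used by DeepMind,

$$L^{KLPEN}(\theta)=\hat{\mathbb{E}}_t\!\left[r_t(\theta)\hat{A}_t-\lambda\,\mathrm{KL}\!\left[\pi_{\theta_\text{old}}(\cdot\mid s_t),\,\pi_\theta(\cdot\mid s_t)\right]\right],$$

with $\lambda$ halved when the measured KL falls below `kl_target / 1.5` and doubled when it exceeds `kl_target * 1.5`, and OpenAI's clipped surrogate objective,

$$L^{CLIP}(\theta)=\hat{\mathbb{E}}_t\!\left[\min\!\left(r_t(\theta)\hat{A}_t,\ \mathrm{clip}\!\left(r_t(\theta),\,1-\epsilon,\,1+\epsilon\right)\hat{A}_t\right)\right],\qquad r_t(\theta)=\frac{\pi_\theta(a_t\mid s_t)}{\pi_{\theta_\text{old}}(a_t\mid s_t)},$$

which the code maximizes by minimizing its negative (`self.aloss`). The discounted returns fed to the critic are bootstrapped backwards from the value of the last state: $R_t = r_t + \gamma R_{t+1}$, seeded with $R_T = V(s_T)$.

The distributed version (DPPO) follows. Instead of one loop that alternates between collecting and training, several worker threads roll out in parallel while a dedicated updater thread trains the shared PPO network: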
1 """ 2 A simple version of OpenAI's Proximal Policy Optimization (PPO). [https://arxiv.org/abs/1707.06347] 3 4 Distributing workers in parallel to collect data, then stop worker's roll-out and train PPO on collected data. 5 Restart workers once PPO is updated. 6 7 The global PPO updating rule is adopted from DeepMind's paper (DPPO): 8 Emergence of Locomotion Behaviours in Rich Environments (Google Deepmind): [https://arxiv.org/abs/1707.02286] 9 10 View more on my tutorial website: https://morvanzhou.github.io/tutorials 11 12 Dependencies: 13 tensorflow r1.3 14 gym 0.9.2 15 """ 16 17 import tensorflow as tf 18 import numpy as np 19 import matplotlib.pyplot as plt 20 import gym, threading, queue 21 22 EP_MAX = 1000 23 EP_LEN = 200 24 N_WORKER = 4 # parallel workers 25 GAMMA = 0.9 # reward discount factor 26 A_LR = 0.0001 # learning rate for actor 27 C_LR = 0.0002 # learning rate for critic 28 MIN_BATCH_SIZE = 64 # minimum batch size for updating PPO 29 UPDATE_STEP = 10 # loop update operation n-steps 30 EPSILON = 0.2 # for clipping surrogate objective 31 GAME = 'Pendulum-v0' 32 S_DIM, A_DIM = 3, 1 # state and action dimension 33 34 35 class PPO(object): 36 def __init__(self): 37 self.sess = tf.Session() 38 self.tfs = tf.placeholder(tf.float32, [None, S_DIM], 'state') 39 40 # critic 41 l1 = tf.layers.dense(self.tfs, 100, tf.nn.relu) 42 self.v = tf.layers.dense(l1, 1) 43 self.tfdc_r = tf.placeholder(tf.float32, [None, 1], 'discounted_r') 44 self.advantage = self.tfdc_r - self.v 45 self.closs = tf.reduce_mean(tf.square(self.advantage)) 46 self.ctrain_op = tf.train.AdamOptimizer(C_LR).minimize(self.closs) 47 48 # actor 49 pi, pi_params = self._build_anet('pi', trainable=True) 50 oldpi, oldpi_params = self._build_anet('oldpi', trainable=False) 51 self.sample_op = tf.squeeze(pi.sample(1), axis=0) # operation of choosing action 52 self.update_oldpi_op = [oldp.assign(p) for p, oldp in zip(pi_params, oldpi_params)] 53 54 self.tfa = tf.placeholder(tf.float32, [None, A_DIM], 'action') 55 self.tfadv = tf.placeholder(tf.float32, [None, 1], 'advantage') 56 # ratio = tf.exp(pi.log_prob(self.tfa) - oldpi.log_prob(self.tfa)) 57 ratio = pi.prob(self.tfa) / (oldpi.prob(self.tfa) + 1e-5) 58 surr = ratio * self.tfadv # surrogate loss 59 60 self.aloss = -tf.reduce_mean(tf.minimum( # clipped surrogate objective 61 surr, 62 tf.clip_by_value(ratio, 1. - EPSILON, 1. 
+ EPSILON) * self.tfadv)) 63 64 self.atrain_op = tf.train.AdamOptimizer(A_LR).minimize(self.aloss) 65 self.sess.run(tf.global_variables_initializer()) 66 67 def update(self): 68 global GLOBAL_UPDATE_COUNTER 69 while not COORD.should_stop(): 70 if GLOBAL_EP < EP_MAX: 71 UPDATE_EVENT.wait() # wait until get batch of data 72 self.sess.run(self.update_oldpi_op) # copy pi to old pi 73 data = [QUEUE.get() for _ in range(QUEUE.qsize())] # collect data from all workers 74 data = np.vstack(data) 75 s, a, r = data[:, :S_DIM], data[:, S_DIM: S_DIM + A_DIM], data[:, -1:] 76 adv = self.sess.run(self.advantage, {self.tfs: s, self.tfdc_r: r}) 77 # update actor and critic in a update loop 78 [self.sess.run(self.atrain_op, {self.tfs: s, self.tfa: a, self.tfadv: adv}) for _ in range(UPDATE_STEP)] 79 [self.sess.run(self.ctrain_op, {self.tfs: s, self.tfdc_r: r}) for _ in range(UPDATE_STEP)] 80 UPDATE_EVENT.clear() # updating finished 81 GLOBAL_UPDATE_COUNTER = 0 # reset counter 82 ROLLING_EVENT.set() # set roll-out available 83 84 def _build_anet(self, name, trainable): 85 with tf.variable_scope(name): 86 l1 = tf.layers.dense(self.tfs, 200, tf.nn.relu, trainable=trainable) 87 mu = 2 * tf.layers.dense(l1, A_DIM, tf.nn.tanh, trainable=trainable) 88 sigma = tf.layers.dense(l1, A_DIM, tf.nn.softplus, trainable=trainable) 89 norm_dist = tf.distributions.Normal(loc=mu, scale=sigma) 90 params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=name) 91 return norm_dist, params 92 93 def choose_action(self, s): 94 s = s[np.newaxis, :] 95 a = self.sess.run(self.sample_op, {self.tfs: s})[0] 96 return np.clip(a, -2, 2) 97 98 def get_v(self, s): 99 if s.ndim < 2: s = s[np.newaxis, :] 100 return self.sess.run(self.v, {self.tfs: s})[0, 0] 101 102 103 class Worker(object): 104 def __init__(self, wid): 105 self.wid = wid 106 self.env = gym.make(GAME).unwrapped 107 self.ppo = GLOBAL_PPO 108 109 def work(self): 110 global GLOBAL_EP, GLOBAL_RUNNING_R, GLOBAL_UPDATE_COUNTER 111 while not COORD.should_stop(): 112 s = self.env.reset() 113 ep_r = 0 114 buffer_s, buffer_a, buffer_r = [], [], [] 115 for t in range(EP_LEN): 116 if not ROLLING_EVENT.is_set(): # while global PPO is updating 117 ROLLING_EVENT.wait() # wait until PPO is updated 118 buffer_s, buffer_a, buffer_r = [], [], [] # clear history buffer, use new policy to collect data 119 a = self.ppo.choose_action(s) 120 s_, r, done, _ = self.env.step(a) 121 buffer_s.append(s) 122 buffer_a.append(a) 123 buffer_r.append((r + 8) / 8) # normalize reward, find to be useful 124 s = s_ 125 ep_r += r 126 127 GLOBAL_UPDATE_COUNTER += 1 # count to minimum batch size, no need to wait other workers 128 if t == EP_LEN - 1 or GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE: 129 v_s_ = self.ppo.get_v(s_) 130 discounted_r = [] # compute discounted reward 131 for r in buffer_r[::-1]: 132 v_s_ = r + GAMMA * v_s_ 133 discounted_r.append(v_s_) 134 discounted_r.reverse() 135 136 bs, ba, br = np.vstack(buffer_s), np.vstack(buffer_a), np.array(discounted_r)[:, np.newaxis] 137 buffer_s, buffer_a, buffer_r = [], [], [] 138 QUEUE.put(np.hstack((bs, ba, br))) # put data in the queue 139 if GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE: 140 ROLLING_EVENT.clear() # stop collecting data 141 UPDATE_EVENT.set() # globalPPO update 142 143 if GLOBAL_EP >= EP_MAX: # stop training 144 COORD.request_stop() 145 break 146 147 # record reward changes, plot later 148 if len(GLOBAL_RUNNING_R) == 0: GLOBAL_RUNNING_R.append(ep_r) 149 else: GLOBAL_RUNNING_R.append(GLOBAL_RUNNING_R[-1]*0.9+ep_r*0.1) 150 GLOBAL_EP += 1 151 
print('{0:.1f}%'.format(GLOBAL_EP/EP_MAX*100), '|W%i' % self.wid, '|Ep_r: %.2f' % ep_r,) 152 153 154 if __name__ == '__main__': 155 GLOBAL_PPO = PPO() 156 UPDATE_EVENT, ROLLING_EVENT = threading.Event(), threading.Event() 157 UPDATE_EVENT.clear() # not update now 158 ROLLING_EVENT.set() # start to roll out 159 workers = [Worker(wid=i) for i in range(N_WORKER)] 160 161 GLOBAL_UPDATE_COUNTER, GLOBAL_EP = 0, 0 162 GLOBAL_RUNNING_R = [] 163 COORD = tf.train.Coordinator() 164 QUEUE = queue.Queue() # workers putting data in this queue 165 threads = [] 166 for worker in workers: # worker threads 167 t = threading.Thread(target=worker.work, args=()) 168 t.start() # training 169 threads.append(t) 170 # add a PPO updating thread 171 threads.append(threading.Thread(target=GLOBAL_PPO.update,)) 172 threads[-1].start() 173 COORD.join(threads) 174 175 # plot reward change and test 176 plt.plot(np.arange(len(GLOBAL_RUNNING_R)), GLOBAL_RUNNING_R) 177 plt.xlabel('Episode'); plt.ylabel('Moving reward'); plt.ion(); plt.show() 178 env = gym.make('Pendulum-v0') 179 while True: 180 s = env.reset() 181 for t in range(300): 182 env.render() 183 s = env.step(GLOBAL_PPO.choose_action(s))[0]
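The part of this listing worth isolating is the handshake between the workers and the updater: two `threading.Event`s gate the two phases, a `queue.Queue` carries the batches, and a global counter decides when enough transitions have accumulated. Below is a minimal, TensorFlow-free sketch of that same pattern; the names mirror the listing, but the rollout and the PPO update are replaced by hypothetical stand-ins (a string on the queue, a print), so it illustrates only the coordination, not the learning:

```python
import threading
import queue

N_WORKER, MIN_BATCH_SIZE, MAX_UPDATES = 4, 64, 10
UPDATE_EVENT, ROLLING_EVENT = threading.Event(), threading.Event()
ROLLING_EVENT.set()                              # rolling out is allowed at the start
QUEUE = queue.Queue()
COUNTER_LOCK = threading.Lock()
counter, updates = 0, 0

def worker(wid):
    global counter
    while updates < MAX_UPDATES:
        # wait with a timeout so the thread can re-check the stop condition
        if not ROLLING_EVENT.wait(timeout=0.1):
            continue
        QUEUE.put('transition from worker %i' % wid)    # stands in for real rollout data
        with COUNTER_LOCK:
            counter += 1
            if counter >= MIN_BATCH_SIZE and ROLLING_EVENT.is_set():
                ROLLING_EVENT.clear()            # freeze all workers
                UPDATE_EVENT.set()               # wake the updater

def updater():
    global counter, updates
    while updates < MAX_UPDATES:
        UPDATE_EVENT.wait()                      # wait for a full batch
        batch = [QUEUE.get() for _ in range(QUEUE.qsize())]
        updates += 1                             # a real PPO update would consume `batch` here
        print('update %i on %i transitions' % (updates, len(batch)))
        with COUNTER_LOCK:
            counter = 0
        UPDATE_EVENT.clear()
        ROLLING_EVENT.set()                      # let the workers resume (and exit at the end)

threads = [threading.Thread(target=worker, args=(i,)) for i in range(N_WORKER)]
threads.append(threading.Thread(target=updater))
for t in threads: t.start()
for t in threads: t.join()
```

The timeout on `ROLLING_EVENT.wait` is the one deliberate departure from the listing above: it lets workers re-check the stop condition periodically instead of blocking forever if the updater has already exited, which plays the role that `tf.train.Coordinator` plays in the original.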