Deep Reinforcement Learning -- DPPO

PPO

Proximal Policy Optimization (PPO) keeps each policy update close to the previous policy, either through an adaptive KL penalty or through a clipped surrogate objective; the clipped version is generally found to work better and is the default in the single-threaded implementation below.
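For reference, the two objectives that the code below optimizes can be written compactly in the PPO paper's notation, where r_t(theta) is the probability ratio between the new and old policies and the advantage estimate A_hat_t is, in this implementation, the discounted return minus the critic's value:

    r_t(\theta) = \frac{\pi_\theta(a_t \mid s_t)}{\pi_{\theta_{\text{old}}}(a_t \mid s_t)}

    L^{\mathrm{CLIP}}(\theta) = \hat{\mathbb{E}}_t\!\left[ \min\!\left( r_t(\theta)\,\hat{A}_t,\ \operatorname{clip}\!\left(r_t(\theta),\, 1-\epsilon,\, 1+\epsilon\right)\hat{A}_t \right) \right]

    L^{\mathrm{KLPEN}}(\theta) = \hat{\mathbb{E}}_t\!\left[ r_t(\theta)\,\hat{A}_t - \lambda\,\mathrm{KL}\!\left[ \pi_{\theta_{\text{old}}}(\cdot \mid s_t) \,\|\, \pi_\theta(\cdot \mid s_t) \right] \right]

Both objectives are maximized; the code minimizes their negatives (self.aloss), using L^CLIP when METHOD is 'clip' and L^KLPEN with an adaptive lambda when METHOD is 'kl_pen'.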

Introduction to DPPO

DPPO (Distributed PPO, from DeepMind's "Emergence of Locomotion Behaviours in Rich Environments") parallelizes data collection: several workers interact with their own copy of the environment and push transitions into a shared queue. Once enough data has accumulated, the workers pause their roll-outs, a single global PPO network is updated on the pooled batch, and the workers then restart with the new policy.
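The thread coordination is the main new piece relative to single-threaded PPO. Below is a minimal standalone sketch of that hand-off using two threading.Event flags and a queue.Queue. The constant names mirror the full script at the end of this post, but the data collection and the update are just placeholder values and sleeps, so treat it as an illustration of the synchronization pattern rather than a working DPPO update.

import threading, queue, time

N_WORKER = 4
MIN_BATCH_SIZE = 64
EP_MAX = 3                          # keep the demo short

UPDATE_EVENT = threading.Event()    # set -> the updater may run
ROLLING_EVENT = threading.Event()   # set -> the workers may collect data
ROLLING_EVENT.set()
QUEUE = queue.Queue()
COUNTER_LOCK = threading.Lock()
GLOBAL_COUNTER = 0                  # transitions collected since the last update
GLOBAL_EP = 0                       # number of updates performed
STOP = threading.Event()

def worker(wid):
    global GLOBAL_COUNTER
    while not STOP.is_set():
        if not ROLLING_EVENT.wait(timeout=0.1):   # paused while the updater runs
            continue
        batch = [wid] * 8                         # pretend we collected 8 transitions
        QUEUE.put(batch)
        with COUNTER_LOCK:
            GLOBAL_COUNTER += len(batch)
            if GLOBAL_COUNTER >= MIN_BATCH_SIZE:
                ROLLING_EVENT.clear()             # stop all workers
                UPDATE_EVENT.set()                # wake the updater
        time.sleep(0.01)

def updater():
    global GLOBAL_COUNTER, GLOBAL_EP
    while not STOP.is_set():
        if not UPDATE_EVENT.wait(timeout=0.1):    # wait for a full batch
            continue
        data = [QUEUE.get() for _ in range(QUEUE.qsize())]  # drain the queue
        time.sleep(0.05)                          # stand-in for the PPO gradient steps
        GLOBAL_EP += 1
        print('update %i trained on %i transitions' % (GLOBAL_EP, sum(len(d) for d in data)))
        with COUNTER_LOCK:
            GLOBAL_COUNTER = 0
        UPDATE_EVENT.clear()
        ROLLING_EVENT.set()                       # let the workers roll out again
        if GLOBAL_EP >= EP_MAX:
            STOP.set()                            # end the demo

threads = [threading.Thread(target=worker, args=(i,)) for i in range(N_WORKER)]
threads.append(threading.Thread(target=updater))
for t in threads: t.start()
for t in threads: t.join()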

PPO Implementation

  1 """
  2 A simple version of Proximal Policy Optimization (PPO) using single thread.
  3 
  4 Based on:
  5 1. Emergence of Locomotion Behaviours in Rich Environments (Google Deepmind): [https://arxiv.org/abs/1707.02286]
  6 2. Proximal Policy Optimization Algorithms (OpenAI): [https://arxiv.org/abs/1707.06347]
  7 
  8 View more on my tutorial website: https://morvanzhou.github.io/tutorials
  9 
 10 Dependencies:
 11 tensorflow r1.2
 12 gym 0.9.2
 13 """
 14 
 15 import tensorflow as tf
 16 import numpy as np
 17 import matplotlib.pyplot as plt
 18 import gym
 19 
 20 EP_MAX = 1000
 21 EP_LEN = 200
 22 GAMMA = 0.9
 23 A_LR = 0.0001
 24 C_LR = 0.0002
 25 BATCH = 32
 26 A_UPDATE_STEPS = 10
 27 C_UPDATE_STEPS = 10
 28 S_DIM, A_DIM = 3, 1
 29 METHOD = [
 30     dict(name='kl_pen', kl_target=0.01, lam=0.5),   # KL penalty
 31     dict(name='clip', epsilon=0.2),                 # Clipped surrogate objective, find this is better
 32 ][1]        # choose the method for optimization
 33 
 34 
 35 class PPO(object):
 36 
 37     def __init__(self):
 38         self.sess = tf.Session()
 39         self.tfs = tf.placeholder(tf.float32, [None, S_DIM], 'state')
 40 
 41         # critic
 42         with tf.variable_scope('critic'):
 43             l1 = tf.layers.dense(self.tfs, 100, tf.nn.relu)
 44             self.v = tf.layers.dense(l1, 1)
 45             self.tfdc_r = tf.placeholder(tf.float32, [None, 1], 'discounted_r')
 46             self.advantage = self.tfdc_r - self.v
 47             self.closs = tf.reduce_mean(tf.square(self.advantage))
 48             self.ctrain_op = tf.train.AdamOptimizer(C_LR).minimize(self.closs)
 49 
 50         # actor
 51         pi, pi_params = self._build_anet('pi', trainable=True)
 52         oldpi, oldpi_params = self._build_anet('oldpi', trainable=False)
 53         with tf.variable_scope('sample_action'):
 54             self.sample_op = tf.squeeze(pi.sample(1), axis=0)       # choosing action
 55         with tf.variable_scope('update_oldpi'):
 56             self.update_oldpi_op = [oldp.assign(p) for p, oldp in zip(pi_params, oldpi_params)]
 57 
 58         self.tfa = tf.placeholder(tf.float32, [None, A_DIM], 'action')
 59         self.tfadv = tf.placeholder(tf.float32, [None, 1], 'advantage')
 60         with tf.variable_scope('loss'):
 61             with tf.variable_scope('surrogate'):
 62                 # ratio = tf.exp(pi.log_prob(self.tfa) - oldpi.log_prob(self.tfa))
 63                 ratio = pi.prob(self.tfa) / oldpi.prob(self.tfa)
 64                 surr = ratio * self.tfadv
 65             if METHOD['name'] == 'kl_pen':
 66                 self.tflam = tf.placeholder(tf.float32, None, 'lambda')
 67                 kl = tf.distributions.kl_divergence(oldpi, pi)
 68                 self.kl_mean = tf.reduce_mean(kl)
 69                 self.aloss = -(tf.reduce_mean(surr - self.tflam * kl))
 70             else:   # clipping method, find this is better
 71                 self.aloss = -tf.reduce_mean(tf.minimum(
 72                     surr,
 73                     tf.clip_by_value(ratio, 1.-METHOD['epsilon'], 1.+METHOD['epsilon'])*self.tfadv))
 74 
 75         with tf.variable_scope('atrain'):
 76             self.atrain_op = tf.train.AdamOptimizer(A_LR).minimize(self.aloss)
 77 
 78         tf.summary.FileWriter("log/", self.sess.graph)
 79 
 80         self.sess.run(tf.global_variables_initializer())
 81 
 82     def update(self, s, a, r):
 83         self.sess.run(self.update_oldpi_op)
 84         adv = self.sess.run(self.advantage, {self.tfs: s, self.tfdc_r: r})
 85         # adv = (adv - adv.mean())/(adv.std()+1e-6)     # sometimes helpful
 86 
 87         # update actor
 88         if METHOD['name'] == 'kl_pen':
 89             for _ in range(A_UPDATE_STEPS):
 90                 _, kl = self.sess.run(
 91                     [self.atrain_op, self.kl_mean],
 92                     {self.tfs: s, self.tfa: a, self.tfadv: adv, self.tflam: METHOD['lam']})
 93                 if kl > 4*METHOD['kl_target']:  # this in in google's paper
 94                     break
 95             if kl < METHOD['kl_target'] / 1.5:  # adaptive lambda, this is in OpenAI's paper
 96                 METHOD['lam'] /= 2
 97             elif kl > METHOD['kl_target'] * 1.5:
 98                 METHOD['lam'] *= 2
 99             METHOD['lam'] = np.clip(METHOD['lam'], 1e-4, 10)    # sometimes explode, this clipping is my solution
100         else:   # clipping method, find this is better (OpenAI's paper)
101             [self.sess.run(self.atrain_op, {self.tfs: s, self.tfa: a, self.tfadv: adv}) for _ in range(A_UPDATE_STEPS)]
102 
103         # update critic
104         [self.sess.run(self.ctrain_op, {self.tfs: s, self.tfdc_r: r}) for _ in range(C_UPDATE_STEPS)]
105 
106     def _build_anet(self, name, trainable):
107         with tf.variable_scope(name):
108             l1 = tf.layers.dense(self.tfs, 100, tf.nn.relu, trainable=trainable)
109             mu = 2 * tf.layers.dense(l1, A_DIM, tf.nn.tanh, trainable=trainable)
110             sigma = tf.layers.dense(l1, A_DIM, tf.nn.softplus, trainable=trainable)
111             norm_dist = tf.distributions.Normal(loc=mu, scale=sigma)
112         params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=name)
113         return norm_dist, params
114 
115     def choose_action(self, s):
116         s = s[np.newaxis, :]
117         a = self.sess.run(self.sample_op, {self.tfs: s})[0]
118         return np.clip(a, -2, 2)
119 
120     def get_v(self, s):
121         if s.ndim < 2: s = s[np.newaxis, :]
122         return self.sess.run(self.v, {self.tfs: s})[0, 0]
123 
124 env = gym.make('Pendulum-v0').unwrapped
125 ppo = PPO()
126 all_ep_r = []
127 
128 for ep in range(EP_MAX):
129     s = env.reset()
130     buffer_s, buffer_a, buffer_r = [], [], []
131     ep_r = 0
132     for t in range(EP_LEN):    # in one episode
133         env.render()
134         a = ppo.choose_action(s)
135         s_, r, done, _ = env.step(a)
136         buffer_s.append(s)
137         buffer_a.append(a)
138         buffer_r.append((r+8)/8)    # normalize reward, find to be useful
139         s = s_
140         ep_r += r
141 
142         # update ppo
143         if (t+1) % BATCH == 0 or t == EP_LEN-1:
144             v_s_ = ppo.get_v(s_)
145             discounted_r = []
146             for r in buffer_r[::-1]:
147                 v_s_ = r + GAMMA * v_s_
148                 discounted_r.append(v_s_)
149             discounted_r.reverse()
150 
151             bs, ba, br = np.vstack(buffer_s), np.vstack(buffer_a), np.array(discounted_r)[:, np.newaxis]
152             buffer_s, buffer_a, buffer_r = [], [], []
153             ppo.update(bs, ba, br)
154     if ep == 0: all_ep_r.append(ep_r)
155     else: all_ep_r.append(all_ep_r[-1]*0.9 + ep_r*0.1)
156     print(
157         'Ep: %i' % ep,
158         "|Ep_r: %i" % ep_r,
159         ("|Lam: %.4f" % METHOD['lam']) if METHOD['name'] == 'kl_pen' else '',
160     )
161 
162 plt.plot(np.arange(len(all_ep_r)), all_ep_r)
163 plt.xlabel('Episode');plt.ylabel('Moving averaged episode reward');plt.show()
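As a sanity check on the return computation inside the update branch above, the same backward, bootstrapped discounting can be written as a small standalone function; the helper name and the example numbers below are made up purely for illustration.

import numpy as np

def discount_with_bootstrap(rewards, v_last, gamma=0.9):
    # Walk the rewards backwards, bootstrapping from the critic's value of the
    # state that follows the last reward, exactly as the buffer loop above does.
    out, running = [], v_last
    for r in reversed(rewards):
        running = r + gamma * running
        out.append(running)
    out.reverse()
    return np.array(out)

print(discount_with_bootstrap([1.0, 0.5, -0.2], v_last=2.0))
# -> [2.746 1.94  1.6  ]: the last entry is -0.2 + 0.9*2.0, and each earlier
#    entry adds its own reward to 0.9 times the entry after it.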

 

DPPO Code

  1 """
  2 A simple version of OpenAI's Proximal Policy Optimization (PPO). [https://arxiv.org/abs/1707.06347]
  3 
  4 Distributing workers in parallel to collect data, then stop worker's roll-out and train PPO on collected data.
  5 Restart workers once PPO is updated.
  6 
  7 The global PPO updating rule is adopted from DeepMind's paper (DPPO):
  8 Emergence of Locomotion Behaviours in Rich Environments (Google Deepmind): [https://arxiv.org/abs/1707.02286]
  9 
 10 View more on my tutorial website: https://morvanzhou.github.io/tutorials
 11 
 12 Dependencies:
 13 tensorflow r1.3
 14 gym 0.9.2
 15 """
 16 
 17 import tensorflow as tf
 18 import numpy as np
 19 import matplotlib.pyplot as plt
 20 import gym, threading, queue
 21 
 22 EP_MAX = 1000
 23 EP_LEN = 200
 24 N_WORKER = 4                # parallel workers
 25 GAMMA = 0.9                 # reward discount factor
 26 A_LR = 0.0001               # learning rate for actor
 27 C_LR = 0.0002               # learning rate for critic
 28 MIN_BATCH_SIZE = 64         # minimum batch size for updating PPO
 29 UPDATE_STEP = 10            # loop update operation n-steps
 30 EPSILON = 0.2               # for clipping surrogate objective
 31 GAME = 'Pendulum-v0'
 32 S_DIM, A_DIM = 3, 1         # state and action dimension
 33 
 34 
 35 class PPO(object):
 36     def __init__(self):
 37         self.sess = tf.Session()
 38         self.tfs = tf.placeholder(tf.float32, [None, S_DIM], 'state')
 39 
 40         # critic
 41         l1 = tf.layers.dense(self.tfs, 100, tf.nn.relu)
 42         self.v = tf.layers.dense(l1, 1)
 43         self.tfdc_r = tf.placeholder(tf.float32, [None, 1], 'discounted_r')
 44         self.advantage = self.tfdc_r - self.v
 45         self.closs = tf.reduce_mean(tf.square(self.advantage))
 46         self.ctrain_op = tf.train.AdamOptimizer(C_LR).minimize(self.closs)
 47 
 48         # actor
 49         pi, pi_params = self._build_anet('pi', trainable=True)
 50         oldpi, oldpi_params = self._build_anet('oldpi', trainable=False)
 51         self.sample_op = tf.squeeze(pi.sample(1), axis=0)  # operation of choosing action
 52         self.update_oldpi_op = [oldp.assign(p) for p, oldp in zip(pi_params, oldpi_params)]
 53 
 54         self.tfa = tf.placeholder(tf.float32, [None, A_DIM], 'action')
 55         self.tfadv = tf.placeholder(tf.float32, [None, 1], 'advantage')
 56         # ratio = tf.exp(pi.log_prob(self.tfa) - oldpi.log_prob(self.tfa))
 57         ratio = pi.prob(self.tfa) / (oldpi.prob(self.tfa) + 1e-5)
 58         surr = ratio * self.tfadv                       # surrogate loss
 59 
 60         self.aloss = -tf.reduce_mean(tf.minimum(        # clipped surrogate objective
 61             surr,
 62             tf.clip_by_value(ratio, 1. - EPSILON, 1. + EPSILON) * self.tfadv))
 63 
 64         self.atrain_op = tf.train.AdamOptimizer(A_LR).minimize(self.aloss)
 65         self.sess.run(tf.global_variables_initializer())
 66 
 67     def update(self):
 68         global GLOBAL_UPDATE_COUNTER
 69         while not COORD.should_stop():
 70             if GLOBAL_EP < EP_MAX:
 71                 UPDATE_EVENT.wait()                     # wait until get batch of data
 72                 self.sess.run(self.update_oldpi_op)     # copy pi to old pi
 73                 data = [QUEUE.get() for _ in range(QUEUE.qsize())]      # collect data from all workers
 74                 data = np.vstack(data)
 75                 s, a, r = data[:, :S_DIM], data[:, S_DIM: S_DIM + A_DIM], data[:, -1:]
 76                 adv = self.sess.run(self.advantage, {self.tfs: s, self.tfdc_r: r})
 77                 # update actor and critic in a update loop
 78                 [self.sess.run(self.atrain_op, {self.tfs: s, self.tfa: a, self.tfadv: adv}) for _ in range(UPDATE_STEP)]
 79                 [self.sess.run(self.ctrain_op, {self.tfs: s, self.tfdc_r: r}) for _ in range(UPDATE_STEP)]
 80                 UPDATE_EVENT.clear()        # updating finished
 81                 GLOBAL_UPDATE_COUNTER = 0   # reset counter
 82                 ROLLING_EVENT.set()         # set roll-out available
 83 
 84     def _build_anet(self, name, trainable):
 85         with tf.variable_scope(name):
 86             l1 = tf.layers.dense(self.tfs, 200, tf.nn.relu, trainable=trainable)
 87             mu = 2 * tf.layers.dense(l1, A_DIM, tf.nn.tanh, trainable=trainable)
 88             sigma = tf.layers.dense(l1, A_DIM, tf.nn.softplus, trainable=trainable)
 89             norm_dist = tf.distributions.Normal(loc=mu, scale=sigma)
 90         params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=name)
 91         return norm_dist, params
 92 
 93     def choose_action(self, s):
 94         s = s[np.newaxis, :]
 95         a = self.sess.run(self.sample_op, {self.tfs: s})[0]
 96         return np.clip(a, -2, 2)
 97 
 98     def get_v(self, s):
 99         if s.ndim < 2: s = s[np.newaxis, :]
100         return self.sess.run(self.v, {self.tfs: s})[0, 0]
101 
102 
103 class Worker(object):
104     def __init__(self, wid):
105         self.wid = wid
106         self.env = gym.make(GAME).unwrapped
107         self.ppo = GLOBAL_PPO
108 
109     def work(self):
110         global GLOBAL_EP, GLOBAL_RUNNING_R, GLOBAL_UPDATE_COUNTER
111         while not COORD.should_stop():
112             s = self.env.reset()
113             ep_r = 0
114             buffer_s, buffer_a, buffer_r = [], [], []
115             for t in range(EP_LEN):
116                 if not ROLLING_EVENT.is_set():                  # while global PPO is updating
117                     ROLLING_EVENT.wait()                        # wait until PPO is updated
118                     buffer_s, buffer_a, buffer_r = [], [], []   # clear history buffer, use new policy to collect data
119                 a = self.ppo.choose_action(s)
120                 s_, r, done, _ = self.env.step(a)
121                 buffer_s.append(s)
122                 buffer_a.append(a)
123                 buffer_r.append((r + 8) / 8)                    # normalize reward, find to be useful
124                 s = s_
125                 ep_r += r
126 
127                 GLOBAL_UPDATE_COUNTER += 1                      # count to minimum batch size, no need to wait other workers
128                 if t == EP_LEN - 1 or GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
129                     v_s_ = self.ppo.get_v(s_)
130                     discounted_r = []                           # compute discounted reward
131                     for r in buffer_r[::-1]:
132                         v_s_ = r + GAMMA * v_s_
133                         discounted_r.append(v_s_)
134                     discounted_r.reverse()
135 
136                     bs, ba, br = np.vstack(buffer_s), np.vstack(buffer_a), np.array(discounted_r)[:, np.newaxis]
137                     buffer_s, buffer_a, buffer_r = [], [], []
138                     QUEUE.put(np.hstack((bs, ba, br)))          # put data in the queue
139                     if GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
140                         ROLLING_EVENT.clear()       # stop collecting data
141                         UPDATE_EVENT.set()          # globalPPO update
142 
143                     if GLOBAL_EP >= EP_MAX:         # stop training
144                         COORD.request_stop()
145                         break
146 
147             # record reward changes, plot later
148             if len(GLOBAL_RUNNING_R) == 0: GLOBAL_RUNNING_R.append(ep_r)
149             else: GLOBAL_RUNNING_R.append(GLOBAL_RUNNING_R[-1]*0.9+ep_r*0.1)
150             GLOBAL_EP += 1
151             print('{0:.1f}%'.format(GLOBAL_EP/EP_MAX*100), '|W%i' % self.wid,  '|Ep_r: %.2f' % ep_r,)
152 
153 
154 if __name__ == '__main__':
155     GLOBAL_PPO = PPO()
156     UPDATE_EVENT, ROLLING_EVENT = threading.Event(), threading.Event()
157     UPDATE_EVENT.clear()            # not update now
158     ROLLING_EVENT.set()             # start to roll out
159     workers = [Worker(wid=i) for i in range(N_WORKER)]
160     
161     GLOBAL_UPDATE_COUNTER, GLOBAL_EP = 0, 0
162     GLOBAL_RUNNING_R = []
163     COORD = tf.train.Coordinator()
164     QUEUE = queue.Queue()           # workers putting data in this queue
165     threads = []
166     for worker in workers:          # worker threads
167         t = threading.Thread(target=worker.work, args=())
168         t.start()                   # training
169         threads.append(t)
170     # add a PPO updating thread
171     threads.append(threading.Thread(target=GLOBAL_PPO.update,))
172     threads[-1].start()
173     COORD.join(threads)
174 
175     # plot reward change and test
176     plt.plot(np.arange(len(GLOBAL_RUNNING_R)), GLOBAL_RUNNING_R)
177     plt.xlabel('Episode'); plt.ylabel('Moving reward'); plt.ion(); plt.show()
178     env = gym.make('Pendulum-v0')
179     while True:
180         s = env.reset()
181         for t in range(300):
182             env.render()
183             s = env.step(GLOBAL_PPO.choose_action(s))[0]

 
