## 强化学习 cartpole_a3c

2017 年 7 月 21 日 CreateAMind

https://github.com/rlcode/reinforcement-learning/blob/master/2-cartpole/5-a3c/cartpole_a3c.py

import numpy as np

import tensorflow as tf

import pylab

import time

import gym

from keras.layers import Dense, Input

from keras.models import Model

from keras import backend as K

episode = 0

scores = []

EPISODES = 2000

# This is A3C(Asynchronous Advantage Actor Critic) agent(global) for the Cartpole

# In this example, we use A3C algorithm

class A3CAgent:

def __init__(self, state_size, action_size, env_name):

# get size of state and action

self.state_size = state_size

self.action_size = action_size

# get gym environment name

self.env_name = env_name

# these are hyper parameters for the A3C

self.actor_lr = 0.001

self.critic_lr = 0.001

self.discount_factor = .99

self.hidden1, self.hidden2 = 24, 24

# create model for actor and critic network

self.actor, self.critic = self.build_model()

# method for training actor and critic network

self.optimizer = [self.actor_optimizer(), self.critic_optimizer()]

self.sess = tf.InteractiveSession()

K.set_session(self.sess)

self.sess.run(tf.global_variables_initializer())

# approximate policy and value using Neural Network

# actor -> state is input and probability of each action is output of network

# critic -> state is input and value of state is output of network

# actor and critic network share first hidden layer

def build_model(self):

state = Input(batch_shape=(None,  self.state_size))

shared = Dense(self.hidden1, input_dim=self.state_size, activation='relu', kernel_initializer='glorot_uniform')(state)

actor_hidden = Dense(self.hidden2, activation='relu', kernel_initializer='glorot_uniform')(shared)

action_prob = Dense(self.action_size, activation='softmax', kernel_initializer='glorot_uniform')(actor_hidden)

value_hidden = Dense(self.hidden2, activation='relu', kernel_initializer='he_uniform')(shared)

state_value = Dense(1, activation='linear', kernel_initializer='he_uniform')(value_hidden)

actor = Model(inputs=state, outputs=action_prob)

critic = Model(inputs=state, outputs=state_value)

actor._make_predict_function()

critic._make_predict_function()

actor.summary()

critic.summary()

return actor, critic

# make loss function for Policy Gradient

# [log(action probability) * advantages] will be input for the back prop

# we add entropy of action probability to loss

def actor_optimizer(self):

action = K.placeholder(shape=(None, self.action_size))

policy = self.actor.output

good_prob = K.sum(action * policy, axis=1)

loss = -K.sum(eligibility)

entropy = K.sum(policy * K.log(policy + 1e-10), axis=1)

actor_loss = loss + 0.01*entropy

return train

# make loss function for Value approximation

def critic_optimizer(self):

discounted_reward = K.placeholder(shape=(None, ))

value = self.critic.output

loss = K.mean(K.square(discounted_reward - value))

return train

# make agents(local) and start training

def train(self):

agents = [Agent(i, self.actor, self.critic, self.optimizer, self.env_name, self.discount_factor,

self.action_size, self.state_size) for i in range(self.threads)]

for agent in agents:

agent.start()

while True:

time.sleep(20)

plot = scores[:]

pylab.plot(range(len(plot)), plot, 'b')

pylab.savefig("./save_graph/cartpole_a3c.png")

self.save_model('./save_model/cartpole_a3c.h5')

def save_model(self, name):

self.actor.save_weights(name + "_actor.h5")

self.critic.save_weights(name + "_critic.h5")

# This is Agent(local) class for threading

def __init__(self, index, actor, critic, optimizer, env_name, discount_factor, action_size, state_size):

self.states = []

self.rewards = []

self.actions = []

self.index = index

self.actor = actor

self.critic = critic

self.optimizer = optimizer

self.env_name = env_name

self.discount_factor = discount_factor

self.action_size = action_size

self.state_size = state_size

def run(self):

global episode

env = gym.make(self.env_name)

while episode < EPISODES:

state = env.reset()

score = 0

while True:

action = self.get_action(state)

next_state, reward, done, _ = env.step(action)

score += reward

self.memory(state, action, reward)

state = next_state

if done:

episode += 1

print("episode: ", episode, "/ score : ", score)

scores.append(score)

self.train_episode(score != 500)

break

# In Policy Gradient, Q function is not available.

# Instead agent uses sample returns for evaluating policy

def discount_rewards(self, rewards, done=True):

discounted_rewards = np.zeros_like(rewards)

if not done:

for t in reversed(range(0, len(rewards))):

return discounted_rewards

# save <s, a ,r> of each step

# this is used for calculating discounted rewards

def memory(self, state, action, reward):

self.states.append(state)

act = np.zeros(self.action_size)

act[action] = 1

self.actions.append(act)

self.rewards.append(reward)

# update policy network and value network every episode

def train_episode(self, done):

discounted_rewards = self.discount_rewards(self.rewards, done)

values = self.critic.predict(np.array(self.states))

values = np.reshape(values, len(values))

self.optimizer[1]([self.states, discounted_rewards])

self.states, self.actions, self.rewards = [], [], []

def get_action(self, state):

policy = self.actor.predict(np.reshape(state, [1, self.state_size]))[0]

return np.random.choice(self.action_size, 1, p=policy)[0]

if __name__ == "__main__":

env_name = 'CartPole-v1'

env = gym.make(env_name)

state_size = env.observation_space.shape[0]

action_size = env.action_space.n

env.close()

global_agent = A3CAgent(state_size, action_size, env_name)

global_agent.train()

### 相关内容

The tutorial is written for those who would like an introduction to reinforcement learning (RL). The aim is to provide an intuitive presentation of the ideas rather than concentrate on the deeper mathematics underlying the topic. RL is generally used to solve the so-called Markov decision problem (MDP). In other words, the problem that you are attempting to solve with RL should be an MDP or its variant. The theory of RL relies on dynamic programming (DP) and artificial intelligence (AI). We will begin with a quick description of MDPs. We will discuss what we mean by “complex” and “large-scale” MDPs. Then we will explain why RL is needed to solve complex and large-scale MDPs. The semi-Markov decision problem (SMDP) will also be covered.

The tutorial is meant to serve as an introduction to these topics and is based mostly on the book: “Simulation-based optimization: Parametric Optimization techniques and reinforcement learning” [4]. The book discusses this topic in greater detail in the context of simulators. There are at least two other textbooks that I would recommend you to read: (i) Neuro-dynamic programming [2] (lots of details on convergence analysis) and (ii) Reinforcement Learning: An Introduction [11] (lots of details on underlying AI concepts). A more recent tutorial on this topic is [8]. This tutorial has 2 sections: • Section 2 discusses MDPs and SMDPs. • Section 3 discusses RL. By the end of this tutorial, you should be able to • Identify problem structures that can be set up as MDPs / SMDPs. • Use some RL algorithms.

MIT新书《强化学习与最优控制》，REINFORCEMENT LEARNING AND OPTIMAL CONTROL https://web.mit.edu/dimitrib/www/Slides_Lecture13_RLOC.pdf https://web.mit.edu/dimitrib/www/RLbook.html

Deep reinforcement learning suggests the promise of fully automated learning of robotic control policies that directly map sensory inputs to low-level actions. However, applying deep reinforcement learning methods on real-world robots is exceptionally difficult, due both to the sample complexity and, just as importantly, the sensitivity of such methods to hyperparameters. While hyperparameter tuning can be performed in parallel in simulated domains, it is usually impractical to tune hyperparameters directly on real-world robotic platforms, especially legged platforms like quadrupedal robots that can be damaged through extensive trial-and-error learning. In this paper, we develop a stable variant of the soft actor-critic deep reinforcement learning algorithm that requires minimal hyperparameter tuning, while also requiring only a modest number of trials to learn multilayer neural network policies. This algorithm is based on the framework of maximum entropy reinforcement learning, and automatically trades off exploration against exploitation by dynamically and automatically tuning a temperature parameter that determines the stochasticity of the policy. We show that this method achieves state-of-the-art performance on four standard benchmark environments. We then demonstrate that it can be used to learn quadrupedal locomotion gaits on a real-world Minitaur robot, learning to walk from scratch directly in the real world in two hours of training.

To solve complex real-world problems with reinforcement learning, we cannot rely on manually specified reward functions. Instead, we can have humans communicate an objective to the agent directly. In this work, we combine two approaches to learning from human feedback: expert demonstrations and trajectory preferences. We train a deep neural network to model the reward function and use its predicted reward to train an DQN-based deep reinforcement learning agent on 9 Atari games. Our approach beats the imitation learning baseline in 7 games and achieves strictly superhuman performance on 2 games without using game rewards. Additionally, we investigate the goodness of fit of the reward model, present some reward hacking problems, and study the effects of noise in the human labels.

Policy gradient methods are often applied to reinforcement learning in continuous multiagent games. These methods perform local search in the joint-action space, and as we show, they are susceptable to a game-theoretic pathology known as relative overgeneralization. To resolve this issue, we propose Multiagent Soft Q-learning, which can be seen as the analogue of applying Q-learning to continuous controls. We compare our method to MADDPG, a state-of-the-art approach, and show that our method achieves better coordination in multiagent cooperative tasks, converging to better local optima in the joint action space.

This work considers the problem of provably optimal reinforcement learning for episodic finite horizon MDPs, i.e. how an agent learns to maximize his/her long term reward in an uncertain environment. The main contribution is in providing a novel algorithm --- Variance-reduced Upper Confidence Q-learning (vUCQ) --- which enjoys a regret bound of $\widetilde{O}(\sqrt{HSAT} + H^5SA)$, where the $T$ is the number of time steps the agent acts in the MDP, $S$ is the number of states, $A$ is the number of actions, and $H$ is the (episodic) horizon time. This is the first regret bound that is both sub-linear in the model size and asymptotically optimal. The algorithm is sub-linear in that the time to achieve $\epsilon$-average regret for any constant $\epsilon$ is $O(SA)$, which is a number of samples that is far less than that required to learn any non-trivial estimate of the transition model (the transition model is specified by $O(S^2A)$ parameters). The importance of sub-linear algorithms is largely the motivation for algorithms such as $Q$-learning and other "model free" approaches. vUCQ algorithm also enjoys minimax optimal regret in the long run, matching the $\Omega(\sqrt{HSAT})$ lower bound. Variance-reduced Upper Confidence Q-learning (vUCQ) is a successive refinement method in which the algorithm reduces the variance in $Q$-value estimates and couples this estimation scheme with an upper confidence based algorithm. Technically, the coupling of both of these techniques is what leads to the algorithm enjoying both the sub-linear regret property and the asymptotically optimal regret.

26+阅读 · 2019年10月13日
CreateAMind
4+阅读 · 2018年12月28日

7+阅读 · 2018年6月24日

6+阅读 · 2018年4月3日

8+阅读 · 2018年3月18日
CreateAMind
10+阅读 · 2017年8月2日

Tianhe Yu,Deirdre Quillen,Zhanpeng He,Ryan Julian,Karol Hausman,Chelsea Finn,Sergey Levine
22+阅读 · 2019年10月24日
4+阅读 · 2019年1月10日
Daniel S. Brown,Yuchen Cui,Scott Niekum
4+阅读 · 2019年1月8日
Tuomas Haarnoja,Aurick Zhou,Sehoon Ha,Jie Tan,George Tucker,Sergey Levine
4+阅读 · 2018年12月26日
Borja Ibarz,Jan Leike,Tobias Pohlen,Geoffrey Irving,Shane Legg,Dario Amodei
4+阅读 · 2018年11月15日
Cédric Colas,Olivier Sigaud,Pierre-Yves Oudeyer
3+阅读 · 2018年8月17日
Vinicius Zambaldi,David Raposo,Adam Santoro,Victor Bapst,Yujia Li,Igor Babuschkin,Karl Tuyls,David Reichert,Timothy Lillicrap,Edward Lockhart,Murray Shanahan,Victoria Langston,Razvan Pascanu,Matthew Botvinick,Oriol Vinyals,Peter Battaglia
5+阅读 · 2018年6月28日
Ermo Wei,Drew Wicke,David Freelan,Sean Luke
10+阅读 · 2018年4月25日