# 详解深度强化学习展现TensorFlow 2.0新特性

TensorFlow 2.0的特性公布已经有一段时间了，但很多人对此应当还是一头雾水。

> conda create -n tf2 python=3.6
> source activate tf2
> pip install tf-nightly-2.0-preview # tf-nightly-gpu-2.0-preview for GPU version

>>> import tensorflow as tf
>>> print(tf.__version__)
1.13.0-dev20190117
>>> print(tf.executing_eagerly())
True

>>> print(tf.reduce_sum([1, 2, 3, 4, 5]))
tf.Tensor(15, shape=(), dtype=int32)

https://www.tensorflow.org/tutorials/eager/eager_basics

Actor-Critic方法

RL算法通常根据优化的目标函数进行分组。基于值的方法（如DQN）通过减少预期状态-动作值(state-action value)的误差来工作。

import numpy as np

import tensorflow as tf

import tensorflow.keras.layers as kl

class ProbabilityDistribution(tf.keras.Model):
    """Trivial Keras model that samples a discrete action from policy logits."""

    def call(self, logits):
        # Draw one categorical sample per batch row, then drop the sample axis
        # so the result has shape (batch,) instead of (batch, 1).
        # The pasted original lost the `return` keyword — the sampled action
        # was computed and discarded.
        return tf.squeeze(tf.random.categorical(logits, 1), axis=-1)

class Model(tf.keras.Model):
    """Actor-critic network: shared input, separate policy and value heads."""

    def __init__(self, num_actions):
        super().__init__('mlp_policy')
        # Plain Keras layers — no tf.get_variable() needed in TF 2.0 style.
        self.hidden1 = kl.Dense(128, activation='relu')
        self.hidden2 = kl.Dense(128, activation='relu')
        self.value = kl.Dense(1, name='value')
        # Logits are unnormalized log probabilities over the action space.
        self.logits = kl.Dense(num_actions, name='policy_logits')
        self.dist = ProbabilityDistribution()

    def call(self, inputs):
        # Inputs may arrive as a numpy array; convert to a float32 tensor.
        x = tf.convert_to_tensor(inputs, dtype=tf.float32)
        # Two separate hidden branches feed the policy and value heads.
        hidden_logs = self.hidden1(x)
        hidden_vals = self.hidden2(x)
        return self.logits(hidden_logs), self.value(hidden_vals)

    def action_value(self, obs):
        # predict() executes call() under the hood and yields (logits, value).
        # The pasted original bound the whole pair to `value` and then used an
        # undefined `logits`; unpack both and return an (action, value) pair —
        # callers below (e.g. `actions[step], values[step] = ...`) expect two
        # values.
        logits, value = self.predict(obs)
        action = self.dist.predict(logits)
        # Squeeze the batch axis so a single observation yields scalars.
        return np.squeeze(action, axis=-1), np.squeeze(value, axis=-1)

import gym

env = gym.make('CartPole-v0')
model = Model(num_actions=env.action_space.n)

obs = env.reset()
# action_value returns an (action, value) pair; the pasted original bound
# only `value`, leaving the `action` printed below undefined.
action, value = model.action_value(obs[None, :])
print(action, value)

Random Agent

class A2CAgent:
    """Minimal agent wrapper: runs a (possibly untrained) model in an env."""

    def __init__(self, model):
        self.model = model

    def test(self, env, render=True):
        """Play one full episode and return the accumulated reward."""
        obs, done, ep_reward = env.reset(), False, 0
        while not done:
            action, _ = self.model.action_value(obs[None, :])
            obs, reward, done, _ = env.step(action)
            # The pasted original fused this accumulation with the render
            # statement (`ep_reward += rewardif render: ...`).
            ep_reward += reward
            if render:
                env.render()
        return ep_reward

# Evaluate the untrained model; CartPole-v0 episodes are capped at 200 steps.
agent = A2CAgent(model)
total_reward = agent.test(env)
print("%d out of 200" % total_reward)

import tensorflow.keras.losses as kls

import tensorflow.keras.optimizers as ko

class A2CAgent:
    """A2C agent: wires the actor-critic model to its losses and optimizer."""

    def __init__(self, model):
        # Coefficients for the auxiliary loss terms.
        self.params = {'value': 0.5, 'entropy': 0.0001}
        self.model = model
        self.model.compile(
            optimizer=ko.RMSprop(lr=0.0007),
            loss=[self._logits_loss, self._value_loss],
        )

    def test(self, env, render=True):
        """Play one episode and return the total reward (same as the earlier section)."""
        obs, done, ep_reward = env.reset(), False, 0
        while not done:
            action, _ = self.model.action_value(obs[None, :])
            obs, reward, done, _ = env.step(action)
            ep_reward += reward
            if render:
                env.render()
        return ep_reward

    def _value_loss(self, returns, value):
        # Critic loss: weighted MSE between value estimates and returns.
        # The pasted original dropped the `return` keyword here.
        return self.params['value'] * kls.mean_squared_error(returns, value)

    def _logits_loss(self, acts_and_advs, logits):
        # Actions and advantages arrive packed into one tensor — a trick to
        # feed both through the standard Keras loss API.
        # NOTE(review): the def line and the policy_loss computation were lost
        # in the paste; reconstructed from the surrounding fragments — confirm
        # against the original article.
        actions, advantages = tf.split(acts_and_advs, 2, axis=-1)
        # from_logits=True lets the loss normalize the raw logits itself.
        cross_entropy = kls.CategoricalCrossentropy(from_logits=True)
        actions = tf.cast(actions, tf.int32)
        # Policy-gradient loss: cross-entropy weighted by advantages.
        policy_loss = cross_entropy(actions, logits, sample_weight=advantages)
        # Entropy bonus via cross-entropy of the distribution with itself.
        entropy_loss = cross_entropy(logits, logits)
        # Signs flipped because the optimizer minimizes.
        return policy_loss - self.params['entropy'] * entropy_loss

Agent Training Loop

class A2CAgent:
    """A2C agent with the full batched training loop."""

    def __init__(self, model):
        # gamma is the reward discount factor.
        self.params = {'value': 0.5, 'entropy': 0.0001, 'gamma': 0.99}
        self.model = model

    def train(self, env, batch_sz=32, updates=1000):
        """Collect batches of experience and train on them.

        Returns the list of per-episode reward sums accumulated during
        training. NOTE(review): the `def train` line and the update loop were
        lost in the paste; signature and loop reconstructed — confirm against
        the original article.
        """
        # Pre-allocated storage for a single batch of samples.
        actions = np.empty((batch_sz,), dtype=np.int32)
        rewards, dones, values = np.empty((3, batch_sz))
        observations = np.empty((batch_sz,) + env.observation_space.shape)

        ep_rews = [0.0]
        next_obs = env.reset()
        for update in range(updates):
            for step in range(batch_sz):
                observations[step] = next_obs.copy()
                actions[step], values[step] = self.model.action_value(next_obs[None, :])
                next_obs, rewards[step], dones[step], _ = env.step(actions[step])

                ep_rews[-1] += rewards[step]
                if dones[step]:
                    ep_rews.append(0.0)
                    next_obs = env.reset()

            # Bootstrap with the critic's estimate of the state after the batch.
            _, next_value = self.model.action_value(next_obs[None, :])
            returns, advs = self._returns_advantages(rewards, dones, values, next_value)
            # Pack actions and advantages into one array for the Keras loss API.
            acts_and_advs = np.concatenate([actions[:, None], advs[:, None]], axis=-1)
            # NOTE(review): assumes the model was compiled with the logits and
            # value losses from the previous section — confirm before running.
            self.model.train_on_batch(observations, [acts_and_advs, returns])
        return ep_rews

    def _returns_advantages(self, rewards, dones, values, next_value):
        """Compute discounted returns and advantages for one batch."""
        # Append the bootstrap value so returns[t+1] is defined at the end.
        returns = np.append(np.zeros_like(rewards), next_value, axis=-1)
        # Discounted sum of future rewards, cut off at episode boundaries.
        for t in reversed(range(rewards.shape[0])):
            returns[t] = rewards[t] + self.params['gamma'] * returns[t + 1] * (1 - dones[t])
        returns = returns[:-1]
        # Advantage = return minus the critic's value baseline.
        advantages = returns - values
        return returns, advantages

    def test(self, env, render=True):
        """Play one episode and return the total reward (same as earlier sections)."""
        obs, done, ep_reward = env.reset(), False, 0
        while not done:
            action, _ = self.model.action_value(obs[None, :])
            obs, reward, done, _ = env.step(action)
            ep_reward += reward
            if render:
                env.render()
        return ep_reward

    def _value_loss(self, returns, value):
        # Critic loss: weighted MSE (same as the previous section).
        return self.params['value'] * kls.mean_squared_error(returns, value)

# Run the full training loop, then score one evaluation episode.
rewards_history = agent.train(env)
print("Finished training, testing...")
final_score = agent.test(env)
print("%d out of 200" % final_score)

eager mode效果这么好，你可能会想知道静态图执行是否也可以。当然是可以！而且，只需要多加一行代码就可以启用静态图执行。

with tf.Graph().as_default():
    # Inside an explicit graph context TF 2.0 runs in static-graph mode.
    # The pasted original lost the indentation of this body, so the code
    # would not actually have executed under the graph context.
    print(tf.executing_eagerly())  # False

    model = Model(num_actions=env.action_space.n)
    agent = A2CAgent(model)

    rewards_history = agent.train(env)
    print("Finished training, testing...")
    print("%d out of 200" % agent.test(env))

One More Thing…

# Build a large batch of identical observations for the benchmarks below.
env = gym.make('CartPole-v0')
first_obs = env.reset()
obs = np.repeat(first_obs[None, :], 100000, axis=0)

Eager Benchmark

# %%time  (Jupyter cell magic — fused with the code in the paste)
model = Model(env.action_space.n)
# Force the Keras model itself to run eagerly as well.
model.run_eagerly = True
print("Eager Execution: ", tf.executing_eagerly())
print("Eager Keras Model:", model.run_eagerly)
_ = model(obs)

######## Results #######
# Eager Execution: True
# Eager Keras Model: True
# CPU times: user 639 ms, sys: 736 ms, total: 1.38 s

Static Benchmark

# %%time  (Jupyter cell magic — fused with the code in the paste)
with tf.Graph().as_default():
    model = Model(env.action_space.n)
    print("Eager Execution: ", tf.executing_eagerly())
    print("Eager Keras Model:", model.run_eagerly)
    _ = model.predict(obs)

######## Results #######
# Eager Execution: False
# Eager Keras Model: False
# CPU times: user 793 ms, sys: 79.7 ms, total: 873 ms

Default Benchmark

# %%time  (Jupyter cell magic — fused with the code in the paste)
model = Model(env.action_space.n)
print("Eager Execution: ", tf.executing_eagerly())
print("Eager Keras Model:", model.run_eagerly)
# NOTE(review): the results suggest predict() avoids per-call eager overhead
# even with eager execution globally enabled — confirm against TF docs.
_ = model.predict(obs)

######## Results #######
# Eager Execution: True
# Eager Keras Model: False
# CPU times: user 994 ms, sys: 23.1 ms, total: 1.02 s

232 篇文章47 人订阅

0 条评论