# DQN三大改进（二）-Prioritised replay

Prioritised replay原文：https://arxiv.org/pdf/1511.05952.pdf

1、背景

DQN中有两个关键的技术，叫做经验回放和双网络结构。

DQN中的损失函数定义为（原文公式图片缺失，按定义补全）：$L(\theta) = \mathbb{E}\big[(q_{\text{target}} - Q(s, a; \theta))^2\big]$

q-target如何计算呢？根据下面的公式（原文公式图片缺失，按定义补全）：$q_{\text{target}} = r + \gamma \max_{a'} Q(s', a'; \theta^-)$

SumTree 是一种树形结构, 每片树叶存储每个样本的优先级 p, 每个树枝节点只有两个分叉, 节点的值是两个分叉的和, 所以 SumTree 的顶端就是所有 p 的和. 如下图所示。最下面一层树叶存储样本的 p, 叶子上一层最左边的 13 = 3 + 10, 按这个规律相加, 顶层的 root 就是全部 p 的和了.

抽样时把总优先级（此例为 42）均分成 n=6 个互不重叠的半开区间：[0, 7), [7, 14), [14, 21), [21, 28), [28, 35), [35, 42)，每个区间内均匀取一个数再在树中检索对应样本。

2、代码实现

# ---------------------- input ----------------------
# Snippet from the DQN network-building method: graph inputs.
# s / s_ are current / next observations; q_target is the TD target per action.
self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s')
self.q_target = tf.placeholder(tf.float32, [None, self.n_actions], name='Q_target')
self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_')
if self.prioritized:
    # Importance-sampling weights, one scalar per transition in the batch,
    # used to correct the bias introduced by prioritized sampling.
    self.ISWeights = tf.placeholder(tf.float32, [None, 1], name='IS_weights')

def build_layers(s, c_names, n_l1, w_initializer, b_initializer, trainable):
    """Build a two-layer fully connected Q-network.

    Args:
        s: input state placeholder, shape [None, n_features].
        c_names: variable collection names (separates eval/target net params).
        n_l1: number of hidden units in the first layer.
        w_initializer, b_initializer: weight/bias initializers.
        trainable: False for the target net so its params are frozen.

    Returns:
        Tensor of Q-values, shape [None, n_actions].
    """
    with tf.variable_scope('l1'):
        w1 = tf.get_variable('w1', [self.n_features, n_l1],
                             initializer=w_initializer, collections=c_names, trainable=trainable)
        b1 = tf.get_variable('b1', [1, n_l1],
                             initializer=b_initializer, collections=c_names, trainable=trainable)
        l1 = tf.nn.relu(tf.matmul(s, w1) + b1)
    with tf.variable_scope('l2'):
        w2 = tf.get_variable('w2', [n_l1, self.n_actions],
                             initializer=w_initializer, collections=c_names, trainable=trainable)
        b2 = tf.get_variable('b2', [1, self.n_actions],
                             initializer=b_initializer, collections=c_names, trainable=trainable)
        out = tf.matmul(l1, w2) + b2
    return out

# -------------------- eval net --------------------
# Snippet from the network-building method: the trainable online network.
with tf.variable_scope('eval_net'):
    c_names, n_l1, w_initializer, b_initializer = \
        ['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES], 20, \
        tf.random_normal_initializer(0., 0.3), tf.constant_initializer(0.1)  # config of layers
    self.q_eval = build_layers(self.s, c_names, n_l1, w_initializer, b_initializer, True)

# -------------------- target net --------------------
# Same architecture but frozen (trainable=False); periodically synced from eval_net.
with tf.variable_scope('target_net'):
    c_names = ['target_net_params', tf.GraphKeys.GLOBAL_VARIABLES]
    self.q_next = build_layers(self.s_, c_names, n_l1, w_initializer, b_initializer, False)

# -------------------- loss and train --------------------
# With prioritized replay the squared TD error of each transition is
# scaled by its importance-sampling weight; abs_errors feeds back into
# the SumTree as the new priority.
with tf.variable_scope('loss'):
    if self.prioritized:
        self.abs_errors = tf.reduce_sum(tf.abs(self.q_target - self.q_eval), axis=1)  # for updating SumTree
        self.loss = tf.reduce_mean(self.ISWeights * tf.squared_difference(self.q_target, self.q_eval))
    else:
        self.loss = tf.reduce_mean(tf.squared_difference(self.q_target, self.q_eval))
with tf.variable_scope('train'):
    self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss)

def __init__(self, capacity):
    """SumTree constructor.

    Stores `capacity` leaf priorities in a flat array of a complete binary
    tree with 2*capacity - 1 nodes; internal node = sum of its two children.

    Args:
        capacity: maximum number of stored transitions (number of leaves).
    """
    self.capacity = capacity
    # next leaf slot to write; wraps around when the buffer is full
    self.data_pointer = 0
    # tree[0] is the root; leaves occupy tree[capacity-1 : 2*capacity-1]
    self.tree = np.zeros(2 * capacity - 1)
    # transition payloads, parallel to the leaves
    self.data = np.zeros(capacity, dtype=object)

@property
def total_p(self):
    """Sum of all priorities (the root of the tree)."""
    return self.tree[0]  # the root

def add(self, p, data):
    """Store one transition with priority p, overwriting the oldest when full.

    Args:
        p: priority for the new sample (leaf value).
        data: the transition payload.
    """
    # leaves start at index capacity - 1 in the flat tree array
    tree_idx = self.data_pointer + self.capacity - 1
    self.data[self.data_pointer] = data
    self.update(tree_idx, p)  # propagate the new priority up to the root

    self.data_pointer += 1
    if self.data_pointer >= self.capacity:  # replace when exceeding the capacity
        self.data_pointer = 0

def update(self, tree_idx, p):
    """Set leaf `tree_idx` to priority p and propagate the delta to the root.

    Args:
        tree_idx: index of the leaf in the flat tree array.
        p: new priority value.
    """
    change = p - self.tree[tree_idx]
    self.tree[tree_idx] = p
    # walk up to the root (index 0), adding the delta to every ancestor
    while tree_idx != 0:
        tree_idx = (tree_idx - 1) // 2
        self.tree[tree_idx] += change

def get_leaf(self, v):
    """Find the leaf whose cumulative-priority interval contains v.

    Descends from the root: go left when v fits in the left subtree's sum,
    otherwise subtract that sum and go right.

    NOTE(review): the source snippet was truncated after "if v"; the descent
    body below is the standard proportional-prioritization retrieval and
    matches the article's SumTree example — confirm against the original repo.

    Args:
        v: a value uniformly drawn from [0, total_p).

    Returns:
        (leaf_idx, priority, data) for the selected transition.
    """
    parent_idx = 0
    while True:
        cl_idx = 2 * parent_idx + 1  # left child in the flat array
        cr_idx = cl_idx + 1
        if cl_idx >= len(self.tree):  # reached a leaf
            leaf_idx = parent_idx
            break
        else:
            if v <= self.tree[cl_idx]:
                parent_idx = cl_idx
            else:
                v -= self.tree[cl_idx]
                parent_idx = cr_idx

    data_idx = leaf_idx - self.capacity + 1
    return leaf_idx, self.tree[leaf_idx], self.data[data_idx]

def __init__(self, capacity):
    """Prioritized replay memory backed by a SumTree.

    Args:
        capacity: maximum number of stored transitions.
    """
    self.tree = SumTree(capacity)
    self.epsilon = 0.01  # small amount to avoid zero priority
    self.alpha = 0.6     # [0~1] convert the importance of TD error to priority
    self.beta = 0.4      # importance-sampling, from initial value increasing to 1
    self.beta_increment_per_sampling = 0.001
    self.abs_err_upper = 1.  # clipped abs error

def store(self, transition):
    """Add a transition with the current maximum leaf priority.

    New samples get the max priority so they are guaranteed to be replayed
    at least once; if the tree is empty, fall back to abs_err_upper.

    Args:
        transition: the experience tuple/array to store.
    """
    max_p = np.max(self.tree.tree[-self.tree.capacity:])  # max over the leaves
    if max_p == 0:
        max_p = self.abs_err_upper
    self.tree.add(max_p, transition)  # set the max p for the new sample

def sample(self, n):
    """Draw a prioritized batch of n transitions.

    Splits [0, total_p) into n equal segments, samples one value per
    segment, and retrieves the corresponding leaf from the SumTree.

    Args:
        n: batch size.

    Returns:
        (b_idx, b_memory, ISWeights): leaf indices (for later priority
        updates), stacked transitions, and importance-sampling weights.
    """
    b_idx = np.empty((n,), dtype=np.int32)
    b_memory = np.empty((n, self.tree.data[0].size))
    ISWeights = np.empty((n, 1))

    pri_seg = self.tree.total_p / n  # width of each sampling segment
    # anneal beta toward 1 so the IS correction becomes exact over time
    self.beta = np.min([1., self.beta + self.beta_increment_per_sampling])
    # smallest sampling probability, used to normalize the IS weights
    min_prob = np.min(self.tree.tree[-self.tree.capacity:]) / self.tree.total_p
    for i in range(n):
        a, b = pri_seg * i, pri_seg * (i + 1)
        v = np.random.uniform(a, b)
        idx, p, data = self.tree.get_leaf(v)
        prob = p / self.tree.total_p
        ISWeights[i, 0] = np.power(prob / min_prob, -self.beta)
        b_idx[i], b_memory[i, :] = idx, data
    return b_idx, b_memory, ISWeights

def store(self, s, a, r, s_):
    """Save one transition (s, a, r, s') into replay memory.

    With prioritized replay the Memory object assigns the priority;
    otherwise fall back to the plain ring-buffer used by vanilla DQN.

    Args:
        s: current observation.
        a: action taken.
        r: reward received.
        s_: next observation.
    """
    if self.prioritized:
        transition = np.hstack((s, [a, r], s_))
        # Memory.store assigns max priority so the sample is replayed at least once
        self.memory.store(transition)
    else:  # random replay
        if not hasattr(self, 'memory_counter'):
            self.memory_counter = 0
        transition = np.hstack((s, [a, r], s_))
        index = self.memory_counter % self.memory_size  # ring buffer: overwrite oldest
        self.memory[index, :] = transition
        self.memory_counter += 1

q_next, q_eval =self.sess.run( [self.q_next,self.q_eval], feed_dict={self.s_: batch_memory[:, -self.n_features:],self.s: batch_memory[:, :self.n_features]})q_target = q_eval.copy()batch_index = np.arange(self.batch_size, dtype=np.int32)eval_act_index = batch_memory[:,self.n_features].astype(int)reward = batch_memory[:,self.n_features +1]q_target[batch_index, eval_act_index] = reward +self.gamma * np.max(q_next, axis=1)ifself.prioritized: _, abs_errors,self.cost =self.sess.run([self._train_op,self.abs_errors,self.loss], feed_dict={self.s: batch_memory[:, :self.n_features],self.q_target: q_target,self.ISWeights: ISWeights})self.memory.batch_update(tree_idx, abs_errors)# update priorityelse: _,self.cost =self.sess.run([self._train_op,self.loss], feed_dict={self.s: batch_memory[:, :self.n_features],self.q_target: q_target})self.cost_his.append(self.cost)self.epsilon =self.epsilon +self.epsilon_incrementifself.epsilon 3、参考文献

1、Prioritized Experience Replay (DQN) (Tensorflow)：https://morvanzhou.github.io/tutorials/machine-learning/reinforcement-learning/4-6-prioritized-replay/

2、PRIORITIZED EXPERIENCE REPLAY：https://www.cnblogs.com/wangxiaocvpr/p/5660232.html

• 发表于:
• 原文链接http://kuaibao.qq.com/s/20180304G0Q0X500?refer=cp_1026
• 腾讯「云+社区」是腾讯内容开放平台帐号（企鹅号）传播渠道之一，根据《腾讯内容开放平台服务协议》转载发布内容。

2018-05-10

2018-03-30

2018-03-27

2019-10-21

2019-10-21

2019-10-21