Reference: 莫烦Python's tutorial video series (莫烦Python的学习视频).
In short, because it is hard to sample from $p(x)$ directly, we take a detour: draw samples from $q(x)$, estimate the expectation there, and re-weight each sample by the importance weight $p(x)/q(x)$:

$$\mathbb{E}_{x \sim p}[f(x)] = \mathbb{E}_{x \sim q}\!\left[f(x)\,\frac{p(x)}{q(x)}\right]$$

If $p(x)$ and $q(x)$ differ greatly, many more samples are needed; with too few samples the estimate has high variance and can be badly wrong.
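A minimal NumPy sketch of this idea (the choices of p, q, f and the sample sizes below are illustrative assumptions, not from the original post): estimate the expectation under p by sampling from q and re-weighting by p(x)/q(x), and see how the estimate degrades when q is far from p or when few samples are used.

import numpy as np

rng = np.random.default_rng(0)

def normal_pdf(x, mu, sigma):
    # density of a Normal(mu, sigma) distribution
    return np.exp(-(x - mu) ** 2 / (2 * sigma ** 2)) / (sigma * np.sqrt(2 * np.pi))

f = lambda x: x ** 2   # E_{x~N(0,1)}[x^2] = 1, so the true value is 1

def importance_estimate(mu_q, sigma_q, n):
    x = rng.normal(mu_q, sigma_q, size=n)                        # sample from q instead of p
    w = normal_pdf(x, 0.0, 1.0) / normal_pdf(x, mu_q, sigma_q)   # importance weight p(x)/q(x)
    return np.mean(f(x) * w)

print("q close to p, n=10000:", importance_estimate(0.5, 1.2, 10000))  # close to 1
print("q far from p, n=10000:", importance_estimate(4.0, 1.0, 10000))  # high variance, often far off
print("q close to p, n=50   :", importance_estimate(0.5, 1.2, 50))     # few samples -> noisy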
3. The surrogate objective
The original objective is replaced by a surrogate: instead of optimizing the expectation under the new policy directly, we reuse samples collected by the old policy and re-weight them, exactly as in importance sampling above. But when should we stop updating on the same batch? PPO is introduced to keep the current network from drifting too far from the old network, i.e. the two policy distributions must not differ too much.
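In symbols (standard PPO notation; the ratio $r_t$ and advantage estimate $\hat{A}_t$ are not spelled out in the original post), the surrogate objective re-weights advantages estimated under the old policy by the probability ratio:

$$r_t(\theta) = \frac{\pi_\theta(a_t \mid s_t)}{\pi_{\theta_{\text{old}}}(a_t \mid s_t)}, \qquad L^{\text{surr}}(\theta) = \hat{\mathbb{E}}_t\!\left[\, r_t(\theta)\,\hat{A}_t \,\right]$$

PPO then constrains how far $\pi_\theta$ may move away from $\pi_{\theta_{\text{old}}}$ while maximizing this objective.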
4. Algorithm pseudocode
PPO-Penalty: approximately solves the KL-constrained update of TRPO, but penalizes the KL divergence in the objective instead of making it a hard constraint, and automatically adapts the penalty coefficient during training so that it stays appropriately scaled.
PPO-Clip: has no KL-divergence term in the objective and no constraint at all. Instead, it relies on specialized clipping of the objective to keep the new policy from moving too far from the old one.
The KL divergence is what limits the size of each policy update (this is the key point).
PPO-Clip drops the KL computation entirely and only clips the probability ratio; in practice this works better.
Running multiple workers in parallel will speed up learning.
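The two objectives described above, written out in the usual notation of the PPO paper (these formulas are not in the original post):

$$L^{\text{KLPEN}}(\theta) = \hat{\mathbb{E}}_t\!\left[\, r_t(\theta)\,\hat{A}_t \;-\; \beta\,\mathrm{KL}\!\left[\pi_{\theta_{\text{old}}}(\cdot \mid s_t)\,\|\,\pi_\theta(\cdot \mid s_t)\right] \right]$$

$$L^{\text{CLIP}}(\theta) = \hat{\mathbb{E}}_t\!\left[\, \min\!\left( r_t(\theta)\,\hat{A}_t,\; \mathrm{clip}\!\left(r_t(\theta),\, 1-\epsilon,\, 1+\epsilon\right)\hat{A}_t \right) \right]$$

In the penalty variant, $\beta$ is halved or doubled depending on whether the measured KL falls below or above a target value, which is exactly what the 'kl_pen' branch of the update method below does.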
5. Algorithm structure
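The code below does not define its imports or hyperparameters. The following setup is an assumption, with values taken from the hyperparameters commonly used with 莫烦's simply_PPO example:

# assumed imports and hyperparameters (not defined in the original excerpt)
import gym
import numpy as np
import tensorflow as tf              # TensorFlow 1.x style (tf.placeholder, tf.Session)
import matplotlib.pyplot as plt

EP_MAX = 1000          # number of training episodes
EP_LEN = 200           # max steps per episode
GAMMA = 0.9            # discount factor
A_LR = 0.0001          # actor learning rate
C_LR = 0.0002          # critic learning rate
BATCH = 32             # update the networks every BATCH steps
A_UPDATE_STEPS = 10    # actor gradient steps per update
C_UPDATE_STEPS = 10    # critic gradient steps per update

# Choose one of the two PPO variants; index [1] selects the clipping method.
METHOD = [
    dict(name='kl_pen', kl_target=0.01, lam=0.5),   # KL penalty with adaptive lambda
    dict(name='clip', epsilon=0.2),                 # clipped surrogate objective
][1]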
class PPO:
    def __init__(self):
        # Build the Actor and Critic networks and assemble the computation graph
        self.sess = tf.Session()
        self.tfs = tf.placeholder(tf.float32, [None, S_DIM], 'state')  # state input, shape [None, S_DIM]

        # Critic network: produces self.v, the estimated state value
        self._build_anet('Critic')

        # Critic loss: squared error between discounted return and value estimate
        with tf.variable_scope('closs'):
            self.tfdc_r = tf.placeholder(tf.float32, [None, 1], name='discounted_r')  # discounted return
            self.adv = self.tfdc_r - self.v               # advantage: discounted return minus value baseline
            closs = tf.reduce_mean(tf.square(self.adv))   # critic loss
            self.ctrain = tf.train.AdamOptimizer(C_LR).minimize(closs)  # critic training op

        # Build the 'pi' and 'oldpi' policy networks and get their parameter lists
        pi, pi_params = self._build_anet('pi', trainable=True)
        oldpi, oldpi_params = self._build_anet('oldpi', trainable=False)

        # Sample one action from the current policy distribution
        with tf.variable_scope('sample_action'):
            self.sample_op = tf.squeeze(pi.sample(1), axis=0)

        # Copy the new pi parameters into oldpi (nothing is assigned until sess.run is called)
        with tf.variable_scope('update_oldpi'):
            self.update_oldpi_op = [oldp.assign(p) for p, oldp in zip(pi_params, oldpi_params)]

        # Actor loss
        with tf.variable_scope('aloss'):
            self.tfa = tf.placeholder(dtype=tf.float32, shape=[None, A_DIM], name='action')  # action input
            self.tfadv = tf.placeholder(tf.float32, [None, 1], 'advantage')                  # advantage input
            with tf.variable_scope('surrogate'):
                ratio = pi.prob(self.tfa) / oldpi.prob(self.tfa)  # probability ratio pi / oldpi
                surr = ratio * self.tfadv                         # surrogate: ratio times advantage
            if METHOD['name'] == 'kl_pen':
                # KL-penalty variant: subtract lambda * KL(oldpi || pi) from the surrogate
                self.tflam = tf.placeholder(tf.float32, None, 'lambda')
                kl = tf.distributions.kl_divergence(oldpi, pi)
                self.kl_mean = tf.reduce_mean(kl)
                self.aloss = -(tf.reduce_mean(surr - self.tflam * kl))
            else:
                # Clipping variant (found to work better): limit how much the surrogate can change
                # by clipping the ratio into [1 - epsilon, 1 + epsilon]
                self.aloss = -tf.reduce_mean(tf.minimum(
                    surr,
                    tf.clip_by_value(ratio, 1. - METHOD['epsilon'], 1. + METHOD['epsilon']) * self.tfadv))

        self.atrain = tf.train.AdamOptimizer(A_LR).minimize(self.aloss)  # actor training op, learning rate A_LR

        # Write the graph to a log file for TensorBoard
        tf.summary.FileWriter('log/', self.sess.graph)
        self.sess.run(tf.global_variables_initializer())

    # Network-building helper
    def _build_anet(self, name, trainable=True):
        if name == 'Critic':
            # Critic: two-layer network whose output self.v estimates the state value
            with tf.variable_scope(name):
                l1_Critic = tf.layers.dense(self.tfs, 100, tf.nn.relu, trainable=trainable, name='l1')
                self.v = tf.layers.dense(l1_Critic, 1, trainable=trainable, name='value_predict')
        else:
            # Actor: used for both 'pi' and 'oldpi'; returns the action distribution
            # and the network's parameter list
            with tf.variable_scope(name):
                l1_Actor = tf.layers.dense(self.tfs, 100, tf.nn.relu, trainable=trainable, name='l1')
                mu = 2 * tf.layers.dense(l1_Actor, A_DIM, tf.nn.tanh, trainable=trainable, name='mu')
                sigma = tf.layers.dense(l1_Actor, A_DIM, tf.nn.softplus, trainable=trainable, name='sigma')
                norm_list = tf.distributions.Normal(loc=mu, scale=sigma)  # Gaussian action distribution
            params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=name)  # parameter list
            return norm_list, params

    def update(self, s, a, r):
        # Copy the current pi parameters into oldpi
        self.sess.run(self.update_oldpi_op)
        # Fetch self.adv; the feed_dict supplies values for the placeholders
        adv = self.sess.run(self.adv, {self.tfdc_r: r, self.tfs: s})
        if METHOD['name'] == 'kl_pen':
            # KL-penalty update with adaptive lambda
            for _ in range(A_UPDATE_STEPS):
                _, kl = self.sess.run(
                    [self.atrain, self.kl_mean],
                    {self.tfa: a, self.tfadv: adv, self.tfs: s, self.tflam: METHOD['lam']})
                if kl > 4 * METHOD['kl_target']:   # early stop; this is in Google's paper
                    break
            if kl < METHOD['kl_target'] / 1.5:     # adaptive lambda; this is in OpenAI's paper
                METHOD['lam'] /= 2
            elif kl > METHOD['kl_target'] * 1.5:
                METHOD['lam'] *= 2
            METHOD['lam'] = np.clip(METHOD['lam'], 1e-4, 10)  # lambda sometimes explodes; this clipping is my fix
        else:
            # Clipping method: just run the actor update several times
            [self.sess.run(self.atrain, {self.tfs: s, self.tfa: a, self.tfadv: adv})
             for _ in range(A_UPDATE_STEPS)]
        # Train the critic
        [self.sess.run(self.ctrain, {self.tfs: s, self.tfdc_r: r}) for _ in range(C_UPDATE_STEPS)]

    def choose_action(self, s):
        # Sample an action from the current policy and clip it to the valid range
        s = s[np.newaxis, :]
        a = self.sess.run(self.sample_op, {self.tfs: s})[0]
        return np.clip(a, -2, 2)

    def get_v(self, s):
        # Estimate the state value
        if s.ndim < 2:
            s = s[np.newaxis, :]
        return self.sess.run(self.v, {self.tfs: s})


env = gym.make('Pendulum-v0').unwrapped
S_DIM = env.observation_space.shape[0]
A_DIM = env.action_space.shape[0]
ppo = PPO()
all_ep_r = []

# Interaction between PPO and the environment; stop after EP_MAX episodes
for ep in range(EP_MAX):
    s = env.reset()
    buffer_s, buffer_a, buffer_r = [], [], []
    ep_r = 0
    for t in range(EP_LEN):
        env.render()
        a = ppo.choose_action(s)
        s_, r, done, _ = env.step(a)
        # Store the transition in the buffers (reward rescaled to roughly [-1, 1])
        buffer_s.append(s)
        buffer_a.append(a)
        buffer_r.append((r + 8) / 8)
        s = s_
        ep_r += r
        # Once the buffer holds a full batch or the episode ends, update PPO
        if (t + 1) % BATCH == 0 or t == EP_LEN - 1:
            # Compute discounted returns, bootstrapping from the value of the last state
            v_s_ = ppo.get_v(s_)
            discounted_r = []
            for r in buffer_r[::-1]:
                v_s_ = r + GAMMA * v_s_
                discounted_r.append(v_s_)
            discounted_r.reverse()
            bs, ba, br = np.vstack(buffer_s), np.vstack(buffer_a), np.vstack(discounted_r)
            # Clear the buffers and update PPO
            buffer_s, buffer_a, buffer_r = [], [], []
            ppo.update(bs, ba, br)
    if ep == 0:
        all_ep_r.append(ep_r)
    else:
        all_ep_r.append(all_ep_r[-1] * 0.9 + ep_r * 0.1)  # moving average of episode reward
    print('Ep:%d | Ep_r:%f' % (ep, ep_r))

plt.plot(np.arange(len(all_ep_r)), all_ep_r)
plt.xlabel('Episode')
plt.ylabel('Moving averaged episode reward')
plt.show()
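To summarize the flow of this sketch: every BATCH steps (or at episode end), the loop computes bootstrapped discounted returns from the critic's value of the final state, copies pi into oldpi inside ppo.update, and then takes A_UPDATE_STEPS actor steps and C_UPDATE_STEPS critic steps on that batch. Note that the code targets the TensorFlow 1.x graph API (tf.placeholder, tf.Session) and the old Gym interface where env.step returns four values, so newer TensorFlow or Gym/Gymnasium versions will need adaptation.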