State-value function:
V_\pi(s_t)=\sum_a Q_\pi(s_t,a)\cdot\pi(a|s_t)
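In other words, the state value is the probability-weighted average of the action values under the current policy. A minimal numeric sketch (the Q values and probabilities below are made up purely for illustration):

import numpy as np

q_values = np.array([1.0, 3.0])    # hypothetical Q_pi(s_t, a) for two actions
pi_probs = np.array([0.25, 0.75])  # hypothetical pi(a | s_t); must sum to 1

v = np.sum(q_values * pi_probs)    # V_pi(s_t) = sum_a Q_pi(s_t, a) * pi(a | s_t)
print(v)                           # 0.25 * 1.0 + 0.75 * 3.0 = 2.5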
The policy function is approximated by a policy network:
\pi(a|s)\approx\pi(a|s;\theta)
The action-value function is approximated by a value network:
q(s,a;W)\approx Q_\pi(s,a)
The state-value function after neural-network approximation:
V(s;\theta,W)=\sum_a q(s,a;W)\cdot\pi(a|s;\theta)
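As a rough sketch of how the two approximators combine (the layer sizes, the dummy state, and the way the one-hot action is concatenated with the state are illustrative assumptions): the value network scores each action, the policy network supplies the weights, and their inner product is V(s;θ,W).

import tensorflow as tf

state_dim, n_actions = 4, 2                        # e.g. CartPole: 4 state features, 2 actions

policy_net = tf.keras.Sequential([                 # pi(a | s; theta)
    tf.keras.layers.Dense(32, activation="relu"),
    tf.keras.layers.Dense(n_actions, activation="softmax"),
])
value_net = tf.keras.Sequential([                  # q(s, a; W), fed [one-hot action, state]
    tf.keras.layers.Dense(32, activation="relu"),
    tf.keras.layers.Dense(1),
])

state = tf.random.normal([1, state_dim])           # dummy state, batch size 1
probs = policy_net(state)                          # shape [1, n_actions]
q_values = tf.concat(
    [value_net(tf.concat([tf.one_hot([a], depth=n_actions), state], axis=1))
     for a in range(n_actions)],
    axis=1)                                        # shape [1, n_actions]
v = tf.reduce_sum(probs * q_values, axis=1)        # V(s; theta, W) = sum_a q(s,a;W) * pi(a|s;theta)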
The policy network is updated repeatedly to increase the value of the state-value function.
The value network is updated repeatedly so that it predicts the obtained return more accurately.
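One common way to write these two updates is the SARSA-style actor-critic rule below, where δ_t is the TD error and α, β are the critic and actor learning rates; the code that follows realizes the same idea by minimizing the squared TD error δ_t² for the critic and using δ_t as the weight on the actor's cross-entropy loss:

\delta_t = r_t + \gamma\, q(s_{t+1}, a_{t+1}; W) - q(s_t, a_t; W)
W \leftarrow W + \alpha\,\delta_t\,\nabla_W\, q(s_t, a_t; W)
\theta \leftarrow \theta + \beta\,\delta_t\,\nabla_\theta \ln \pi(a_t|s_t;\theta)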
Policy Network
In each update, the Agent executes one action and receives one reward.
The convergence of this network is fairly sensitive to choices such as model size and activation function.
# -*- coding: utf-8 -*-
# @Time : 2022/3/29 21:51
# @Author : CyrusMay WJ
# @FileName: AC.py
# @Software: PyCharm
# @Blog :https://blog.csdn.net/Cyrus_May
import tensorflow as tf
import numpy as np
import logging
import sys
import gym


class Critic():
    """Value network: approximates q(s, a; W) from the state concatenated with a one-hot action."""
    def __init__(self, logger=None, input_dim=6, gamma=0.9):
        self.logger = logger
        self.__build_model(input_dim)
        self.gamma = gamma
        self.optimizer = tf.optimizers.Adam(learning_rate=0.001)

    def __build_model(self, input_dim):
        self.model = tf.keras.Sequential([
            tf.keras.layers.Dense(32, activation="relu"),
            tf.keras.layers.Dense(1)
        ])
        self.model.build(input_shape=[None, input_dim])

    def predict(self, action, state):
        action = tf.one_hot([action], depth=2)
        state = tf.convert_to_tensor([state])
        x = tf.concat([action, state], axis=1)
        return self.model(x)[0][0]

    def train(self, state, state_, action, action_, reward, done):
        action = tf.one_hot([action], depth=2)
        state = tf.convert_to_tensor([state])
        action_ = tf.one_hot([action_], depth=2)
        state_ = tf.convert_to_tensor([state_])
        x = tf.concat([action, state], axis=1)
        x_ = tf.concat([action_, state_], axis=1)
        done = 0 if done else 1  # mask: drop the bootstrapped term on terminal transitions
        with tf.GradientTape() as tape:
            q = self.model(x)
            q_ = self.model(x_)
            Td_error = reward + done * self.gamma * q_ - q
            loss = tf.square(Td_error)  # minimize the squared TD error
        dt = tape.gradient(loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(zip(dt, self.model.trainable_variables))
        return Td_error


class Actor():
    """Policy network: approximates pi(a | s; theta) and samples actions from it."""
    def __init__(self, logger=None, input_dim=4, gamma=0.9, output_dim=2):
        self.logger = logger
        self.__build_model(input_dim, output_dim)
        self.gamma = gamma
        self.optimizer = tf.optimizers.Adam(learning_rate=0.001)
        self.output_dim = output_dim

    def __build_model(self, input_dim, output_dim=2):
        self.model = tf.keras.Sequential([
            tf.keras.layers.Dense(32, activation="relu"),
            tf.keras.layers.Dense(output_dim)
        ])
        self.model.build(input_shape=[None, input_dim])

    def predict(self, state):
        state = tf.convert_to_tensor([state])
        logits = self.model(state)
        prob = tf.nn.softmax(logits).numpy()
        action = np.random.choice([i for i in range(self.output_dim)], p=prob.ravel())
        return action

    def train(self, state, action, TD_error, done):
        state = tf.convert_to_tensor([state])
        with tf.GradientTape() as tape:
            logits = self.model(state)
            # cross entropy = -log pi(a|s); weighting it by the TD error gives the policy-gradient loss
            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=[action], logits=logits)
            loss = tf.reduce_sum(tf.multiply(TD_error, loss))
        dt = tape.gradient(loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(zip(dt, self.model.trainable_variables))


class Agent():
    def __init__(self, gamma=0.9, logger=None):
        self.gamma = gamma
        self.logger = logger
        self.env = gym.make("CartPole-v0")
        self.actor = Actor(logger=logger, input_dim=4, gamma=self.gamma, output_dim=2)
        self.critic = Critic(logger=logger, input_dim=6, gamma=self.gamma)  # 4 state dims + 2 one-hot action dims

    def train(self, tran_epochs=1000, max_act=100):
        history_returns = []
        for epoch in range(tran_epochs):
            single_returns = 0
            state = self.env.reset()
            for iter in range(max_act):
                self.env.render()
                action = self.actor.predict(state)
                state_, reward, done, info = self.env.step(action)
                action_ = self.actor.predict(state_)
                TD_error = self.critic.train(state, state_, action, action_, reward, done)
                self.actor.train(state, action, TD_error, done)
                single_returns += reward
                state = state_
                if done:
                    break
            if history_returns:
                history_returns.append(history_returns[-1] * 0.9 + 0.1 * single_returns)  # smoothed return
            else:
                history_returns.append(single_returns)
            self.logger.info("epoch:{}/{} || epoch return:{:,.4f} || history return:{:,.4f}".format(
                epoch + 1, tran_epochs, single_returns, history_returns[-1]))
        self.env.close()

    def test(self, max_act=1000):
        state = self.env.reset()
        single_returns = 0
        for iter in range(max_act):
            self.env.render()
            action = self.actor.predict(state)
            state_, reward, done, info = self.env.step(action)
            single_returns += reward
            state = state_  # advance to the next state
            if done:
                self.logger.info("End in {} iterations".format(iter + 1))
                break
        if not done:
            self.logger.info("success and return is {}".format(single_returns))


if __name__ == '__main__':
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    screen_handler = logging.StreamHandler(sys.stdout)
    screen_handler.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)s - %(module)s.%(funcName)s:%(lineno)d - %(levelname)s - %(message)s')
    screen_handler.setFormatter(formatter)
    logger.addHandler(screen_handler)
    agent = Agent(logger=logger)
    agent.train(tran_epochs=2000, max_act=500)
    agent.test()
Part of this article consists of notes written while following study videos on Bilibili.
by CyrusMay 2022 03 29
Is the color that cannot be touched called a rainbow
Is the embrace that cannot be seen called a breeze
————Mayday, "Starry Sky"————