In the Q-Learning and Sarsa algorithms we relied on a single data structure, the Q table, which stores every state together with the Q value of each action in that state. In realistic problems there may be many actions, and the number of states can easily be exponential, so storing the table and looking up the current state becomes a serious bottleneck. With machine learning we can instead hand the state and actions to a neural network: the network outputs a Q value for each action, and we then proceed according to the usual Q-learning rules.

The network spares us the cost of storing and searching a huge table, and it can also pick up detailed features of the state, which has greatly advanced reinforcement learning.
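To make the idea concrete, here is a minimal sketch (plain NumPy, with made-up layer sizes and randomly initialized placeholder weights) of replacing the table lookup Q[s][a] with a forward pass through a small network that outputs one Q value per action; it is only an illustration, not the network built later in this post.

import numpy as np

n_features, n_actions, n_hidden = 2, 4, 10   # made-up sizes, for illustration only

# randomly initialized weights of a one-hidden-layer Q-network (placeholder values)
w1 = np.random.randn(n_features, n_hidden) * 0.3
b1 = np.zeros((1, n_hidden))
w2 = np.random.randn(n_hidden, n_actions) * 0.3
b2 = np.zeros((1, n_actions))

def q_values(state):
    """Instead of looking up Q[s] in a table, compute Q(s, .) with a forward pass."""
    s = np.asarray(state, dtype=np.float32).reshape(1, -1)
    h = np.maximum(0.0, s @ w1 + b1)          # ReLU hidden layer
    return (h @ w2 + b2).ravel()              # one Q value per action

greedy_action = int(np.argmax(q_values([0.5, -0.2])))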
When Q-Learning updates its values it needs two quantities, the Q target ("Q现实") and the Q estimate ("Q估计"), both of which are read straight from the Q table.

In DQN we need the same two quantities to update the network's parameters. Where do they come from?

First, the network predicts the Q estimate for the current state:
$Q(s, a_1), \quad Q(s, a_2)$
The Q target is also a Q value predicted by a neural network, but it is an estimate for the next state s', and it is generally produced by the target network (described below):
$Q_{\text{target}} = R + \gamma \cdot \max\big[\,Q(s', a_1),\ Q(s', a_2)\,\big]$
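For instance, with made-up numbers $R = 1$, $\gamma = 0.9$, $Q(s', a_1) = 2$ and $Q(s', a_2) = 1.5$, this gives $Q_{\text{target}} = 1 + 0.9 \times \max(2, 1.5) = 2.8$.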
The parameters are then updated using the difference between the two, scaled by the learning rate:
$\alpha\,\big(Q_{\text{target}} - Q_{\text{estimate}}\big)$
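In tabular Q-learning this correction is added directly to the table entry; in DQN the same difference becomes the regression error that gradient descent minimizes. A minimal sketch with made-up numbers (the training step in the code below minimizes the same squared difference with TensorFlow's RMSPropOptimizer):

import numpy as np

gamma, alpha = 0.9, 0.01                      # made-up discount and learning rate
reward = 1.0
q_eval_s = np.array([1.2, 0.7])               # Q(s, a1), Q(s, a2) from the eval network
q_next_s = np.array([2.0, 1.5])               # Q(s', a1), Q(s', a2) from the target network
a = 0                                         # action actually taken in s

q_target = reward + gamma * np.max(q_next_s)  # the "Q target"
td_error = q_target - q_eval_s[a]             # Q_target - Q_estimate
loss = td_error ** 2                          # squared error the network is trained on

# tabular Q-learning would do: Q[s][a] += alpha * td_error
# DQN instead takes a gradient step on `loss` with respect to the network parameters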
Putting these pieces together with experience replay and a periodically updated target network gives the full DQN algorithm:

Initialize replay memory D to capacity N
Initialize action-value function Q with random weights θ
Initialize target action-value function Q̂ with weights θ⁻ = θ
For episode = 1, M do:
    Initialize sequence s_1 = {x_1} and preprocessed sequence φ_1 = φ(s_1)
    For t = 1, T do:
        With probability ε select a random action a_t,
        otherwise select a_t = argmax_a Q(φ(s_t), a; θ)
        Execute action a_t in the emulator and observe reward r_t and image x_{t+1}
        Set s_{t+1} = s_t, a_t, x_{t+1} and preprocess φ_{t+1} = φ(s_{t+1})
        Store transition (φ_t, a_t, r_t, φ_{t+1}) in D
        Sample a random minibatch of transitions (φ_j, a_j, r_j, φ_{j+1}) from D
        Set y_j = r_j                                      if the episode terminates at step j+1
            y_j = r_j + γ max_{a'} Q̂(φ_{j+1}, a'; θ⁻)      otherwise
        Perform a gradient descent step on (y_j − Q(φ_j, a_j; θ))² with respect to the network parameters θ
        Every C steps reset Q̂ = Q
    End For
End For
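One detail worth noting in the pseudocode is the two-case target y_j: at a terminal step there is no bootstrap term. A minimal NumPy sketch of that single line, assuming the replay memory also stores a done flag (the toy maze code below stores only (s, a, r, s') and always bootstraps):

import numpy as np

gamma = 0.9
rewards = np.array([1.0, 0.0, -1.0])          # r_j for a made-up minibatch of 3 transitions
q_next = np.array([[2.0, 1.5],                # Q̂(φ_{j+1}, a'; θ⁻) from the target network
                   [0.3, 0.9],
                   [0.0, 0.0]])
done = np.array([False, False, True])         # whether the episode terminated at step j+1

# y_j = r_j                                   if the episode terminates at step j+1
# y_j = r_j + γ max_{a'} Q̂(φ_{j+1}, a'; θ⁻)   otherwise
y = rewards + gamma * np.max(q_next, axis=1) * (~done)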
For more background, see 莫烦 (Morvan)'s reinforcement learning tutorials. The environment code maze_env.py is listed in the earlier post 【reinforcement learning】Q-Learning简介.
main.py
from maze_env import Maze
from RL_brain import DeepQNetwork


def run_maze():
    step = 0
    for episode in range(600):
        # start a new episode
        observation = env.reset()
        while True:
            env.render()
            action = RL.choose_action(observation)
            observation_, reward, done = env.step(action)
            RL.store_transition(observation, action, reward, observation_)
            # start learning once enough transitions are stored, then learn every 5 steps
            if (step > 200) and (step % 5 == 0):
                RL.learn()
            observation = observation_
            if done:
                break
            step += 1
    print('game over')
    env.destroy()


if __name__ == "__main__":
    env = Maze()
    RL = DeepQNetwork(env.n_actions, env.n_features,
                      learning_rate=0.01,
                      reward_decay=0.9,
                      e_greedy=0.9,
                      replace_target_iter=200,
                      memory_size=2000)
    env.after(100, run_maze)
    env.mainloop()
    RL.plot_cost()
RL_brain.py
import numpy as np
import pandas as pd
import tensorflow as tf

np.random.seed(1)
tf.set_random_seed(1)


# Deep Q Network, off-policy
class DeepQNetwork:
    def __init__(
            self,
            n_actions,
            n_features,
            learning_rate=0.01,
            reward_decay=0.9,
            e_greedy=0.9,
            replace_target_iter=300,
            memory_size=500,
            batch_size=32,
            e_greedy_increment=None,
            output_graph=False,
    ):
        self.n_actions = n_actions
        self.n_features = n_features
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon_max = e_greedy
        self.replace_target_iter = replace_target_iter
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.epsilon_increment = e_greedy_increment
        self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max

        self.learn_step_counter = 0

        # each transition is (s, a, r, s_), i.e. n_features*2 + 2 columns
        self.memory = np.zeros((self.memory_size, n_features * 2 + 2))

        self._build_net()

        # op that copies the eval network's weights into the target network
        t_params = tf.get_collection('target_net_params')
        e_params = tf.get_collection('eval_net_params')
        self.replace_target_op = [tf.assign(t, e) for t, e in zip(t_params, e_params)]

        self.sess = tf.Session()

        if output_graph:
            tf.summary.FileWriter("./logs/", self.sess.graph)

        self.sess.run(tf.global_variables_initializer())
        self.cost_his = []

    def _build_net(self):
        # ------------------ eval network ------------------
        self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s')
        self.q_target = tf.placeholder(tf.float32, [None, self.n_actions], name='Q_target')
        with tf.variable_scope('eval_net'):
            c_names, n_l1, w_initializer, b_initializer = \
                ['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES], 10, \
                tf.random_normal_initializer(0., 0.3), tf.constant_initializer(0.1)

            with tf.variable_scope('l1'):
                w1 = tf.get_variable('w1', [self.n_features, n_l1], initializer=w_initializer, collections=c_names)
                b1 = tf.get_variable('b1', [1, n_l1], initializer=b_initializer, collections=c_names)
                l1 = tf.nn.relu(tf.matmul(self.s, w1) + b1)

            with tf.variable_scope('l2'):
                w2 = tf.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names)
                b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names)
                self.q_eval = tf.matmul(l1, w2) + b2

        with tf.variable_scope('loss'):
            self.loss = tf.reduce_mean(tf.squared_difference(self.q_target, self.q_eval))
        with tf.variable_scope('train'):
            self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss)

        # ------------------ target network ------------------
        self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_')
        with tf.variable_scope('target_net'):
            c_names = ['target_net_params', tf.GraphKeys.GLOBAL_VARIABLES]

            with tf.variable_scope('l1'):
                w1 = tf.get_variable('w1', [self.n_features, n_l1], initializer=w_initializer, collections=c_names)
                b1 = tf.get_variable('b1', [1, n_l1], initializer=b_initializer, collections=c_names)
                l1 = tf.nn.relu(tf.matmul(self.s_, w1) + b1)

            with tf.variable_scope('l2'):
                w2 = tf.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names)
                b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names)
                self.q_next = tf.matmul(l1, w2) + b2

    def store_transition(self, s, a, r, s_):
        if not hasattr(self, 'memory_counter'):
            self.memory_counter = 0
        transition = np.hstack((s, [a, r], s_))
        # overwrite the oldest memory with the new transition
        index = self.memory_counter % self.memory_size
        self.memory[index, :] = transition
        self.memory_counter += 1

    def choose_action(self, observation):
        # add a batch dimension before feeding the tf placeholder
        observation = observation[np.newaxis, :]
        if np.random.uniform() < self.epsilon:
            # forward pass: get the q value of every action and act greedily
            actions_value = self.sess.run(self.q_eval, feed_dict={self.s: observation})
            action = np.argmax(actions_value)
        else:
            action = np.random.randint(0, self.n_actions)
        return action

    def learn(self):
        # periodically copy the eval network's weights into the target network
        if self.learn_step_counter % self.replace_target_iter == 0:
            self.sess.run(self.replace_target_op)
            print('\ntarget_params_replaced\n')

        # sample a random minibatch of transitions from replay memory
        if self.memory_counter > self.memory_size:
            sample_index = np.random.choice(self.memory_size, size=self.batch_size)
        else:
            sample_index = np.random.choice(self.memory_counter, size=self.batch_size)
        batch_memory = self.memory[sample_index, :]

        q_next, q_eval = self.sess.run(
            [self.q_next, self.q_eval],
            feed_dict={
                self.s_: batch_memory[:, -self.n_features:],   # next states, fixed (target) params
                self.s: batch_memory[:, :self.n_features],     # current states, newest (eval) params
            })

        # build q_target: only the taken action's entry changes, the rest stays equal to q_eval
        q_target = q_eval.copy()
        batch_index = np.arange(self.batch_size, dtype=np.int32)
        eval_act_index = batch_memory[:, self.n_features].astype(int)
        reward = batch_memory[:, self.n_features + 1]
        q_target[batch_index, eval_act_index] = reward + self.gamma * np.max(q_next, axis=1)

        # train the eval network on the squared difference
        _, self.cost = self.sess.run([self._train_op, self.loss],
                                     feed_dict={self.s: batch_memory[:, :self.n_features],
                                                self.q_target: q_target})
        self.cost_his.append(self.cost)

        # gradually increase epsilon to reduce exploration over time
        self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max
        self.learn_step_counter += 1

    def plot_cost(self):
        import matplotlib.pyplot as plt
        plt.plot(np.arange(len(self.cost_his)), self.cost_his)
        plt.ylabel('Cost')
        plt.xlabel('training steps')
        plt.show()