
莫烦Python Reinforcement Learning Series: DQN (Code)

The first listing below is the DQN agent class (RL_brain.py); the second is the training script that runs it on the maze environment from the same tutorial.
import numpy as np
import pandas as pd
import tensorflow as tf

np.random.seed(1)
tf.set_random_seed(1)


# Deep Q Network off-policy
class DeepQNetwork:
    def __init__(
            self,
            n_actions,
            n_features,
            learning_rate=0.01,
            reward_decay=0.9,
            e_greedy=0.9,
            replace_target_iter=300,
            memory_size=500,
            batch_size=32,
            e_greedy_increment=None,
            output_graph=False,
    ):
        self.n_actions = n_actions
        self.n_features = n_features
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon_max = e_greedy
        self.replace_target_iter = replace_target_iter
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.epsilon_increment = e_greedy_increment
        self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max

        # total learning step
        self.learn_step_counter = 0

        # initialize zero memory [s, a, r, s_]
        self.memory = np.zeros((self.memory_size, n_features * 2 + 2))

        # consist of [target_net, evaluate_net]
        self._build_net()

        # tf.get_collection(key, scope=None) returns a list of all elements
        # stored in the collection named 'key'
        t_params = tf.get_collection('target_net_params')
        e_params = tf.get_collection('eval_net_params')
        self.replace_target_op = [tf.assign(t, e) for t, e in zip(t_params, e_params)]

        self.sess = tf.Session()

        if output_graph:
            # $ tensorboard --logdir=logs
            # tf.train.SummaryWriter soon be deprecated, use following
            tf.summary.FileWriter("logs/", self.sess.graph)

        self.sess.run(tf.global_variables_initializer())
        self.cost_his = []

    def _build_net(self):
        # ------------------ build evaluate_net ------------------
        self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s')  # input
        self.q_target = tf.placeholder(tf.float32, [None, self.n_actions], name='Q_target')  # for calculating loss
        with tf.variable_scope('eval_net'):
            # c_names (collections_names) are the collections to store variables
            c_names, n_l1, w_initializer, b_initializer = \
                ['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES], 10, \
                tf.random_normal_initializer(0., 0.3), tf.constant_initializer(0.1)  # config of layers

            # first layer. collections is used later when assigning to target net
            with tf.variable_scope('l1'):
                w1 = tf.get_variable('w1', [self.n_features, n_l1], initializer=w_initializer, collections=c_names)
                b1 = tf.get_variable('b1', [1, n_l1], initializer=b_initializer, collections=c_names)
                l1 = tf.nn.relu(tf.matmul(self.s, w1) + b1)

            # second layer. collections is used later when assigning to target net
            with tf.variable_scope('l2'):
                w2 = tf.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names)
                b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names)
                self.q_eval = tf.matmul(l1, w2) + b2  # shape: [batch_size, self.n_actions]

        with tf.variable_scope('loss'):
            self.loss = tf.reduce_mean(tf.squared_difference(self.q_target, self.q_eval))
        with tf.variable_scope('train'):
            self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss)

        # ------------------ build target_net ------------------
        self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_')  # input
        with tf.variable_scope('target_net'):
            # c_names (collections_names) are the collections to store variables
            c_names = ['target_net_params', tf.GraphKeys.GLOBAL_VARIABLES]

            # first layer. collections is used later when assigning to target net
            with tf.variable_scope('l1'):
                w1 = tf.get_variable('w1', [self.n_features, n_l1], initializer=w_initializer, collections=c_names)
                b1 = tf.get_variable('b1', [1, n_l1], initializer=b_initializer, collections=c_names)
                l1 = tf.nn.relu(tf.matmul(self.s_, w1) + b1)

            # second layer. collections is used later when assigning to target net
            with tf.variable_scope('l2'):
                w2 = tf.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names)
                b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names)
                self.q_next = tf.matmul(l1, w2) + b2

    def store_transition(self, s, a, r, s_):
        if not hasattr(self, 'memory_counter'):
            self.memory_counter = 0

        transition = np.hstack((s, [a, r], s_))

        # replace the old memory with new memory
        index = self.memory_counter % self.memory_size
        self.memory[index, :] = transition

        self.memory_counter += 1

    def choose_action(self, observation):
        # to have batch dimension when feeding into the tf placeholder
        observation = observation[np.newaxis, :]  # shape = (1, n_features)

        if np.random.uniform() < self.epsilon:
            # forward feed the observation and get the q value for every action
            actions_value = self.sess.run(self.q_eval, feed_dict={self.s: observation})
            action = np.argmax(actions_value)  # no axis given, so argmax returns a single flat index
        else:
            action = np.random.randint(0, self.n_actions)
        return action

    def learn(self):
        # check whether to replace target parameters
        if self.learn_step_counter % self.replace_target_iter == 0:
            self.sess.run(self.replace_target_op)
            print('\ntarget_params_replaced\n')

        # sample batch memory from all memory
        if self.memory_counter > self.memory_size:
            sample_index = np.random.choice(self.memory_size, size=self.batch_size)
        else:
            sample_index = np.random.choice(self.memory_counter, size=self.batch_size)
        batch_memory = self.memory[sample_index, :]

        q_next, q_eval = self.sess.run(
            [self.q_next, self.q_eval],
            feed_dict={
                # each memory row is [s, a, r, s_]
                self.s_: batch_memory[:, -self.n_features:],  # fixed params
                self.s: batch_memory[:, :self.n_features],    # newest params
            })

        # change q_target w.r.t q_eval's action
        q_target = q_eval.copy()

        batch_index = np.arange(self.batch_size, dtype=np.int32)
        eval_act_index = batch_memory[:, self.n_features].astype(int)  # action column; astype(int) casts the array's dtype
        reward = batch_memory[:, self.n_features + 1]                  # reward column

        q_target[batch_index, eval_act_index] = reward + self.gamma * np.max(q_next, axis=1)

        """
        For example, in this batch I have 2 samples and 3 actions:
        q_eval =
        [[1, 2, 3],
         [4, 5, 6]]
        q_target = q_eval =
        [[1, 2, 3],
         [4, 5, 6]]
        Then change q_target with the real q_target value w.r.t the q_eval's action.
        For example:
            in sample 0, I took action 0, and the max q_target value is -1;
            in sample 1, I took action 2, and the max q_target value is -2:
        q_target =
        [[-1, 2, 3],
         [4, 5, -2]]
        So (q_target - q_eval) becomes:
        [[(-1)-(1), 0, 0],
         [0, 0, (-2)-(6)]]
        We then backpropagate this error w.r.t the corresponding action to the network,
        and leave the other actions at error=0 because we didn't choose them.
        """

        # train eval network
        _, self.cost = self.sess.run([self._train_op, self.loss],
                                     feed_dict={self.s: batch_memory[:, :self.n_features],
                                                self.q_target: q_target})
        self.cost_his.append(self.cost)

        # increasing epsilon
        self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max
        self.learn_step_counter += 1

    def plot_cost(self):
        import matplotlib.pyplot as plt
        plt.plot(np.arange(len(self.cost_his)), self.cost_his)
        plt.ylabel('Cost')
        plt.xlabel('training steps')
        plt.show()
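
The key step in learn() is the single indexing line that builds q_target. Below is a minimal standalone NumPy sketch of that step with made-up numbers (n_features = 2, 3 actions, a batch of 2); the q_eval and q_next arrays stand in for what sess.run would return. Only the entries for the actions that were actually taken are moved onto the Bellman target, so every other entry contributes a zero TD error to the loss.

import numpy as np

gamma = 0.9
n_features = 2

# Each replay-memory row is [s, a, r, s_]: 2 state features, action index, reward, 2 next-state features.
batch_memory = np.array([[0.1, 0.2, 0, 1.0, 0.3, 0.4],
                         [0.5, 0.6, 2, -1.0, 0.7, 0.8]])

# Stand-ins for the network outputs (normally produced by sess.run on q_eval / q_next).
q_eval = np.array([[1., 2., 3.],
                   [4., 5., 6.]])
q_next = np.array([[0.5, 1.5, 1.0],
                   [2.0, 0.0, 1.0]])

q_target = q_eval.copy()
batch_index = np.arange(2, dtype=np.int32)
eval_act_index = batch_memory[:, n_features].astype(int)  # actions taken: [0, 2]
reward = batch_memory[:, n_features + 1]                  # rewards: [1.0, -1.0]

# Bellman target only for the chosen actions; all other entries keep q_eval's value.
q_target[batch_index, eval_act_index] = reward + gamma * np.max(q_next, axis=1)

print(q_target - q_eval)
# approximately:
# [[ 1.35  0.    0.  ]
#  [ 0.    0.   -5.2 ]]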
from maze_env import Maze
from RL_brain import DeepQNetwork


def run_maze():
    step = 0
    for episode in range(300):
        # initial observation
        observation = env.reset()

        while True:
            # fresh env
            env.render()

            # RL choose action based on observation
            action = RL.choose_action(observation)

            # RL take action and get next observation and reward
            observation_, reward, done = env.step(action)

            RL.store_transition(observation, action, reward, observation_)

            if (step > 200) and (step % 5 == 0):
                RL.learn()

            # swap observation
            observation = observation_

            # break while loop when end of this episode
            if done:
                break
            step += 1

    # end of game
    print('game over')
    env.destroy()


if __name__ == "__main__":
    # maze game
    env = Maze()
    RL = DeepQNetwork(env.n_actions, env.n_features,
                      learning_rate=0.01,
                      reward_decay=0.9,
                      e_greedy=0.9,
                      replace_target_iter=200,
                      memory_size=2000,
                      # output_graph=True
                      )
    env.after(100, run_maze)
    env.mainloop()
    RL.plot_cost()
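
One detail that is easy to miss in this run script: e_greedy_increment is left at its default None, so self.epsilon is pinned at epsilon_max = 0.9 from the very first step and the agent keeps 10% random exploration throughout training. Below is a small standalone sketch (my own numbers, not part of the tutorial) of the alternative schedule you would get by passing, say, e_greedy_increment=0.001, where epsilon ramps up from 0 and early actions are almost all random.

# Replays the epsilon bookkeeping from DeepQNetwork.__init__ and learn(),
# with a hypothetical increment of 0.001 (not used in the run script above).
epsilon_max = 0.9
e_greedy_increment = 0.001

epsilon = 0 if e_greedy_increment is not None else epsilon_max  # starts at 0 here
for learn_step in range(1000):
    # ... one learn() call would happen here ...
    epsilon = epsilon + e_greedy_increment if epsilon < epsilon_max else epsilon_max

print(round(epsilon, 3))  # reaches 0.9 after roughly 900 learn() calls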

 
