
[Reinforcement Learning] Building Soft Actor-Critic (SAC) with TensorFlow 2.x to Train LunarLanderContinuous-v2

This post provides a SAC implementation in TensorFlow.

  • Paper link.
  • The policy network's loss function was not written exactly as in the original paper. (no longer applies; see the updates below)
  • Update: the policy network's loss function now follows the original paper.
  • Update: an adaptive temperature parameter alpha has been added to control the policy entropy (a reference sketch of the standard objectives follows this list).
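
For reference, the standard SAC objectives that the adaptive temperature is built around can be summarized in a short sketch. This is a minimal, generic formulation and not the exact code further below (the script below averages the log-probability over action dimensions and uses its own temperature loss); the tensors logprob, q1, q2 and the scalar target_entropy are placeholders for values produced elsewhere in a training step.

import tensorflow as tf

def sac_policy_and_alpha_losses(logprob, q1, q2, log_alpha, target_entropy):
    # logprob: log pi(a|s) of freshly sampled, tanh-squashed actions, shape (batch, 1)
    # q1, q2:  the two critics' values for those actions, shape (batch, 1)
    alpha = tf.exp(log_alpha)
    # policy loss: E[ alpha * log pi(a|s) - min(Q1, Q2) ]; alpha is treated as a constant here
    policy_loss = tf.reduce_mean(tf.stop_gradient(alpha) * logprob - tf.minimum(q1, q2))
    # temperature loss: E[ -alpha * (log pi(a|s) + target_entropy) ]; the log-prob is treated as a constant here
    alpha_loss = -tf.reduce_mean(alpha * tf.stop_gradient(logprob + target_entropy))
    return policy_loss, alpha_loss

A common choice for the target entropy is minus the action dimension (here -2), whereas the script below uses its own temperature loss built around a fixed mini_entropy of 0.1.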

requirements.txt:
tensorflow-gpu==2.4.0
gym[all]==0.21.0
tensorflow_probability==0.14.0
keras==2.6.0
matplotlib==3.5.1
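
With these pinned versions installed (for example via pip install -r requirements.txt), a quick sanity check that TensorFlow sees the GPU and that the Box2D environment is available might look like the following sketch:

import tensorflow as tf
import gym

print(tf.__version__)                                   # expected: 2.4.0
print(tf.config.list_physical_devices('GPU'))           # an empty list means CPU-only execution
env = gym.make('LunarLanderContinuous-v2')
print(env.observation_space.shape, env.action_space.shape)  # (8,) and (2,), matching the networks below
env.close()

The full training script:
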
from tensorflow.keras import layers, models, Input, optimizers, losses
from tensorflow_probability.python.distributions import Normal
from collections import deque

import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import random
import copy
import gym

class SoftActorCritic:
    def __init__(self, state_shape, action_dim):
        self.ema = tf.train.ExponentialMovingAverage(decay=0.995)
        self.replay_buffer = deque(maxlen=10000)
        self.gamma = 0.997

        # log of the entropy temperature; kept in float32 so it combines cleanly with the networks' float32 outputs
        self.log_alpha = tf.Variable(np.random.normal(), trainable=True, dtype=tf.float32, name="EntropyTemperature")
        self.mini_entropy = 0.1

        self.policy_OPT = optimizers.Adam(learning_rate=1e-3)
        self.Q1_OPT = optimizers.Adam(learning_rate=1e-3)
        self.Q2_OPT = optimizers.RMSprop(learning_rate=1e-3)
        self.value_OPT = optimizers.Adam(learning_rate=1e-3)
        self.alpha_OPT = optimizers.SGD(learning_rate=1e-3)

        policy_input = Input(shape=state_shape)
        x = layers.Dense(units=1024, activation='relu')(policy_input)
        x = layers.Dense(units=1024, activation='relu')(x)
        policy_mean = layers.Dense(units=action_dim, activation='linear')(x)
        log_policy_std = layers.Dense(units=action_dim, activation='linear')(x)
        log_policy_std_clipped = tf.clip_by_value(log_policy_std, -10, 2)
        self.policy_network = models.Model(inputs=policy_input, outputs=[policy_mean, log_policy_std_clipped])

        value_input = Input(shape=state_shape)
        x = layers.Dense(units=1024, activation='relu')(value_input)
        x = layers.Dense(units=1024, activation='relu')(x)
        value_output = layers.Dense(units=1, activation='linear')(x)
        self.value_network = models.Model(inputs=value_input, outputs=value_output)
        self.target_value_network = models.clone_model(self.value_network)
        self._update_target_value_network()

        Q_state_input = Input(shape=state_shape)
        Q_action_input = Input(shape=(action_dim,))
        x = layers.concatenate([Q_state_input, Q_action_input])
        x = layers.Dense(units=1024, activation='relu')(x)
        x = layers.Dense(units=1024, activation='relu')(x)
        Q_output = layers.Dense(units=1, activation='linear')(x)
        self.Q_network_1 = models.Model(inputs=[Q_state_input, Q_action_input], outputs=Q_output)
        self.Q_network_2 = models.clone_model(self.Q_network_1)

    def _update_target_value_network(self):
        self.ema.apply(self.value_network.trainable_variables)
        for target_value_network_para, value_network_para in zip(self.target_value_network.trainable_variables, self.value_network.trainable_variables):
            target_value_network_para.assign(self.ema.average(value_network_para))

    def save_memory(self, state, action, reward, next_state, done):
        self.replay_buffer.append((state, action, reward, next_state, done))

    def select_action(self, state):
        state = np.array([state])
        policy_mean, log_policy_std = self.policy_network(state)
        policy_mean = np.array(policy_mean[0])
        log_policy_std = np.array(log_policy_std[0])
        policy_std = np.exp(log_policy_std)
        gaussian_distribution = Normal(policy_mean, policy_std)
        action = np.tanh(gaussian_distribution.sample())
        return action

    def update_weights(self, batch_size):
        batch_size = min(batch_size, len(self.replay_buffer))
        training_data = random.sample(self.replay_buffer, batch_size)
        state, action, reward, next_state, done = [], [], [], [], []
        for data in training_data:
            s, a, r, n_s, d = data
            state.append(s)
            action.append(a)
            reward.append(r)
            next_state.append(n_s)
            done.append(d)
        # cast to float32 so the batch arrays combine cleanly with the networks' float32 outputs
        state = np.array(state, dtype=np.float32)
        action = np.array(action, dtype=np.float32)
        reward = np.reshape(reward, newshape=(-1, 1)).astype(np.float32)
        next_state = np.array(next_state, dtype=np.float32)
        done = np.reshape(done, newshape=(-1, 1)).astype(np.float32)

        with tf.GradientTape() as tape:
            policy_mean, log_policy_std = self.policy_network(state)
            policy_std = tf.exp(log_policy_std)

            gaussian_distribution = Normal(policy_mean, policy_std)
            gaussian_sampling = gaussian_distribution.sample()

            sample_action = tf.tanh(gaussian_sampling)
            logprob = gaussian_distribution.log_prob(gaussian_sampling) - tf.math.log(
                1.0 - tf.pow(sample_action, 2) + 1e-6)

            logprob = tf.reduce_mean(logprob, axis=-1, keepdims=True)
            new_Q_value = tf.math.minimum(self.Q_network_1([state, sample_action]), self.Q_network_2([state, sample_action]))

            # alpha enters the policy loss as a constant (no gradient flows into log_alpha here)
            policy_loss = tf.reduce_mean(tf.stop_gradient(tf.exp(self.log_alpha)) * logprob - new_Q_value)

        policy_network_grad = tape.gradient(policy_loss, self.policy_network.trainable_variables)
        self.policy_OPT.apply_gradients(zip(policy_network_grad, self.policy_network.trainable_variables))

        with tf.GradientTape() as tape:
            # temperature update: only log_alpha is trained here; logprob is already constant w.r.t. it
            alpha_loss = - tf.exp(self.log_alpha) * (tf.reduce_mean(tf.exp(logprob) * logprob) + self.mini_entropy)
        alpha_grad = tape.gradient(alpha_loss, [self.log_alpha])
        self.alpha_OPT.apply_gradients(zip(alpha_grad, [self.log_alpha]))

        with tf.GradientTape() as tape:
            value = self.value_network(state)
            value_ = tf.stop_gradient(new_Q_value - tf.exp(self.log_alpha) * logprob)
            value_loss = tf.reduce_mean(losses.mean_squared_error(value_, value))
        value_network_grad = tape.gradient(value_loss, self.value_network.trainable_variables)
        self.value_OPT.apply_gradients(zip(value_network_grad, self.value_network.trainable_variables))

        target_value = tf.stop_gradient(self.target_value_network(next_state))
        Q_ = reward + self.gamma * (1 - done) * target_value

        with tf.GradientTape() as tape:
            Q_1 = self.Q_network_1([state, action])
            Q_1_loss = tf.reduce_mean(losses.mean_squared_error(Q_, Q_1))
        Q_network_1_grad = tape.gradient(Q_1_loss, self.Q_network_1.trainable_variables)
        self.Q1_OPT.apply_gradients(zip(Q_network_1_grad, self.Q_network_1.trainable_variables))

        with tf.GradientTape() as tape:
            Q_2 = self.Q_network_2([state, action])
            Q_2_loss = tf.reduce_mean(losses.mean_squared_error(Q_, Q_2))
        Q_network_2_grad = tape.gradient(Q_2_loss, self.Q_network_2.trainable_variables)
        self.Q2_OPT.apply_gradients(zip(Q_network_2_grad, self.Q_network_2.trainable_variables))

        self._update_target_value_network()
        return (
            np.array(Q_1_loss, dtype=np.float64),
            np.array(Q_2_loss, dtype=np.float64),
            np.array(policy_loss, dtype=np.float64),
            np.array(value_loss, dtype=np.float64),
            np.array(alpha_loss, dtype=np.float64),
            float(tf.exp(self.log_alpha))
        )

    def save_weights(self, path):
        self.policy_network.save_weights(path + '-policy_network.h5')
        self.value_network.save_weights(path + '-value_network.h5')
        self.Q_network_1.save_weights(path + '-Q_network_1.h5')
        self.Q_network_2.save_weights(path + '-Q_network_2.h5')

    def load_weights(self, path):
        self.policy_network.load_weights(path + '-policy_network.h5')
        self.value_network.load_weights(path + '-value_network.h5')
        self.Q_network_1.load_weights(path + '-Q_network_1.h5')
        self.Q_network_2.load_weights(path + '-Q_network_2.h5')

if __name__ == '__main__':
    RENDER = False
    EPISODES = 2000
    BATCH_SIZE = 256
    env = gym.make('LunarLanderContinuous-v2')
    agent = SoftActorCritic((8,), 2)
    # agent.load_weights('./LunarLanderContinuous-v2')
    loss_list = []
    reward_list = []
    _100_window_reward_list = []
    f = open('log.txt', 'w')
    for e in range(EPISODES):
        state = env.reset()
        rewards = 0
        while True:
            if RENDER:
                env.render()
            action = agent.select_action(state)
            next_state, reward, done, _ = env.step(action)
            rewards += reward
            agent.save_memory(state, action, reward, next_state, done)
            state = copy.deepcopy(next_state)

            if done: break

        Q1_loss, Q2_loss, policy_loss, value_loss, alpha_loss, alpha = agent.update_weights(BATCH_SIZE)
        loss_list.append(np.sum([Q1_loss, Q2_loss, policy_loss, value_loss]))
        reward_list.append(rewards)
        _100_window_reward = sum(reward_list[-100:]) / len(reward_list[-100:])
        _100_window_reward_list.append(_100_window_reward)
        log = """
        ==============================================================================
        |>episode: {}/{}
        |>memory length: {}
        |>losses 
        |    >>
        |        Q1_loss: {}, Q2_loss: {}, 
        |        policy_loss: {}, value_loss: {}
        |        alpha: {}, alpha_loss: {}
        |    << 
        |>score: {}, avg score: {}
        ==============================================================================
        """.format(
            e + 1, EPISODES, len(agent.replay_buffer),
            Q1_loss, Q2_loss, policy_loss, value_loss, alpha, alpha_loss,
            rewards, _100_window_reward
        )
        f.write(log)
        print("episode: {}/{}, score: {}, avg_score: {}".format(e+1, EPISODES, rewards, _100_window_reward))
        agent.save_weights('./LunarLanderContinuous-v2')
    f.close()
    plt.plot(reward_list)
    plt.plot(_100_window_reward_list)
    plt.show()
    plt.plot(loss_list)
    plt.show()
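
Since save_weights is called after every episode, the most recent weights can be reloaded for a quick visual check of the learned policy. A minimal evaluation sketch, assuming the SoftActorCritic class above is defined in (or imported into) the same file:

env = gym.make('LunarLanderContinuous-v2')
agent = SoftActorCritic((8,), 2)
agent.load_weights('./LunarLanderContinuous-v2')
for episode in range(5):
    state = env.reset()
    episode_reward = 0.0
    while True:
        env.render()
        # select_action still samples from the squashed Gaussian; for a fully
        # greedy rollout one could instead take tanh of the policy mean
        action = agent.select_action(state)
        state, reward, done, _ = env.step(action)
        episode_reward += reward
        if done:
            break
    print('evaluation episode {}: return {:.1f}'.format(episode + 1, episode_reward))
env.close()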


Reward curve (figure: per-episode reward and its 100-episode moving average, as plotted by the script above).
