Multi-Agent Deep Deterministic Policy Gradient (MADDPG) is a reinforcement learning algorithm for multi-agent environments. It is an extension of the Deep Deterministic Policy Gradient (DDPG) algorithm. MADDPG is mainly used to handle cooperation and competition among agents, especially when the interactions between agents are complex. The sections below describe the core concepts and working principles of MADDPG.
Before introducing MADDPG, it helps to understand its foundation, the DDPG algorithm. DDPG combines deep learning with reinforcement learning and targets problems with continuous action spaces. It couples policy-gradient methods with Q-learning (a value-function approximation method) and learns a deterministic policy to solve complex decision-making problems.
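For reference, the standard DDPG updates (as in Lillicrap et al.) can be summarized by a critic regression target and a deterministic policy gradient. The notation below (actor \mu_\theta, critic Q_\phi, primes for target networks) is the usual one and is included here only as background:

y = r + \gamma\, Q_{\phi'}\!\big(s', \mu_{\theta'}(s')\big), \qquad
L(\phi) = \mathbb{E}\big[\big(Q_\phi(s,a) - y\big)^2\big], \qquad
\nabla_\theta J \approx \mathbb{E}\big[\nabla_a Q_\phi(s,a)\big|_{a=\mu_\theta(s)}\,\nabla_\theta \mu_\theta(s)\big]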
MADDPG accounts for the dynamics and complexity of multi-agent environments. In such environments, each agent's behavior depends not only on the state of the environment but also on the policies of the other agents. MADDPG gives each agent its own actor-critic architecture and lets the training process use information about the other agents' policies (centralized training with decentralized execution), which improves learning performance and stability.
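Concretely, in the original MADDPG formulation (Lowe et al., 2017) agent i's critic is centralized: it conditions on the joint state/observations x and the actions of all N agents, while its actor \mu_i only sees the agent's own observation o_i. In the paper's notation, the updates are:

y = r_i + \gamma\, Q_i^{\mu'}\!\big(x', a_1', \ldots, a_N'\big)\Big|_{a_j' = \mu_j'(o_j')}

L(\theta_i) = \mathbb{E}\big[\big(Q_i^{\mu}(x, a_1, \ldots, a_N) - y\big)^2\big]

\nabla_{\theta_i} J(\mu_i) = \mathbb{E}\big[\nabla_{\theta_i}\mu_i(o_i)\,\nabla_{a_i} Q_i^{\mu}(x, a_1, \ldots, a_N)\big|_{a_i=\mu_i(o_i)}\big]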
Initialization:
    For each agent i:
        Initialize the actor network π_i and the critic network Q_i, along with their target networks π'_i and Q'_i.
    Initialize the experience replay buffer D.
Repeat (for each episode):
    Initialize the environment state S.
    Repeat (for each time step):
        For each agent i:
            Based on the current policy π_i and its observation o_i of state S, select action a_i.
        Execute the joint action [a_1, ..., a_N]; observe the new state S' and the rewards R.
        For each agent i, store the transition t = (o_i, a_i, R_i, o'_i) (together with the global state needed by the centralized critic) in D.
        For each agent i:
            Sample a random batch of transitions from D.
            For each sampled transition:
                Compute the target Q value using the target networks.
            Update the critic Q_i by minimizing the loss L = (Q_i(x, a_1, ..., a_N) - target Q)^2.
            Update the actor π_i using the policy gradient.
        For each agent i:
            Soft-update the target network parameters:
                π'_i ← τ π_i + (1 - τ) π'_i
                Q'_i ← τ Q_i + (1 - τ) Q'_i
    Until the episode ends.
Repeat until the termination condition is met.
MADDPG is an important algorithm in the field of multi-agent reinforcement learning. It targets continuous action spaces and is particularly well suited to environments where multiple agents interact, whether cooperatively, competitively, or both. The following is a comparison of MADDPG with several other multi-agent reinforcement learning algorithms:
MADDPG is applicable to a wide range of multi-agent scenarios, including but not limited to:
Environment introduction: simple_adversary_v3
This is an environment with both cooperation and competition.
Basic environment information
agent.silent: when True, the agent passes no communication messages (it stays silent).
Agent observations (observation):
if agent.adversary == True:
a numpy.array: [distances from the landmarks to itself (4), distances from the other agents to itself (4)]
if agent.adversary == False:
a numpy.array: [the agent's own coordinates (2), its relative distance to the target (2), distances from the landmarks to itself (2), distances from the other agents to itself (4)] (the shapes can be checked directly with the inspection sketch below)
Reward function
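The per-agent reward values and the observation layout described above are easiest to check by querying the environment directly. A minimal inspection sketch, assuming pettingzoo[mpe] is installed and using only illustrative variable names:

# Minimal sketch: inspect observation shapes and per-agent rewards of simple_adversary_v3.
# Assumes `pettingzoo[mpe]` is installed; N=2 matches the training script below.
from pettingzoo.mpe import simple_adversary_v3

env = simple_adversary_v3.parallel_env(N=2, continuous_actions=True)
multi_obs, infos = env.reset(seed=0)
# Step once with random actions to see the reward dictionary.
actions = {name: env.action_space(name).sample() for name in env.agents}
multi_obs, rewards, dones, truncs, infos = env.step(actions)
for name in env.agents:
    # Expected with N=2: obs shape (8,) for the adversary, (10,) for each good agent.
    print(name, "obs shape:", multi_obs[name].shape, "reward:", rewards[name])
env.close()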
main.py
""" #!/usr/bin/env python # -*- coding:utf-8 -*- @Project : MADDPG """ from pettingzoo.mpe import simple_adversary_v3 import numpy as np import torch import torch.nn as nn import os import time from maddpg_agent import Agent torch.autograd.set_detect_anomaly(True) device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") print(f"Using device:{device}") def multi_obs_to_state(multi_obs): state= np.array([]) for agent_obs in multi_obs.values(): state = np.concatenate([state, agent_obs]) return state NUM_EPISODE = 1000 NUM_STEP = 100 MEMORY_SIZE = 10000 BATCH_SIZE = 512 TARGET_UPDATE_INTERVAL = 200 LR_ACTOR = 0.001 LR_CRITIC = 0.001 HIDDEN_DIM = 64 GAMMA = 0.99 TAU = 0.01 scenario = "simple_adversary_v3" current_path = os.path.dirname(os.path.realpath(__file__)) agent_path = current_path + "\\" +"models"+ "\\" + scenario + "\\" timestamp = time.strftime("%Y%m%d%H%M%S") # 1. initialize the agent # 初始化环境 env = simple_adversary_v3.parallel_env(N=2, max_cycles= NUM_STEP, continuous_actions= True) multi_obs, infos = env.reset() NUM_AGENT = env.num_agents agent_name_list = env.agents # 1.1 get obs_dim obs_dim = [] for agent_obs in multi_obs.values(): obs_dim.append(agent_obs.shape[0]) state_dim = sum(obs_dim) # 1.2 get action_dim action_dim = [] for agent_name in agent_name_list: action_dim.append(env.action_space(agent_name).sample().shape[0]) agents=[] # 实例化多个智能体 for agent_i in range(NUM_AGENT): print(f"Initializing agent {agent_i}.....") agent = Agent( memo_size=MEMORY_SIZE, obs_dim=obs_dim[agent_i], state_dim= state_dim, n_agent = NUM_AGENT, action_dim = action_dim[agent_i], alpha=LR_ACTOR ,beta= LR_CRITIC, fc1_dims = HIDDEN_DIM, fc2_dims=HIDDEN_DIM, gamma = GAMMA, tau=TAU , batch_size=BATCH_SIZE) agents.append(agent) # 2. Main training loop for episode_i in range(NUM_EPISODE): multi_obs, infos = env.reset() episode_reward = 0 mlti_done = {agent_name:False for agent_name in agent_name_list} for step_i in range(NUM_STEP): total_step = episode_i*NUM_STEP+step_i # 2.1 collecting action from all agents multi_actions ={} # 用于存储动作集合 for agent_i, agent_name in enumerate(agent_name_list): agent = agents[agent_i] single_obs = multi_obs[agent_name] single_action = agent.get_action(single_obs) multi_actions[agent_name] = single_action # 2.2 executing actions, multi_next_obs, multi_reward, multi_done, multi_truncations, infos = env.step(multi_actions) state= multi_obs_to_state(multi_obs) next_state = multi_obs_to_state(multi_next_obs) if step_i >= NUM_STEP -1: multi_done = {agent_name: True for agent_name in agent_name_list} #2.3 store memory for agent_i, agent_name in enumerate(agent_name_list): agent = agents[agent_i] single_obs = multi_obs[agent_name] single_next_obs = multi_next_obs[agent_name] single_action = multi_actions[agent_name] single_reward = multi_reward[agent_name] single_done = multi_done[agent_name] # 存储到经验池中 agent.replay_buffer.add_memo(single_obs,single_next_obs,state, next_state, single_action,single_reward,single_done) #2.4 Update brain every fixed step multi_batch_obses=[] multi_batch_next_obses =[] multi_batch_states = [] multi_batch_next_states = [] multi_batch_actions = [] multi_batch_next_actions =[] multi_batch_online_actions =[] multi_batch_rewards =[] multi_batch_dones = [] #2.4.1 sample a batch of memories current_memo_size = min (MEMORY_SIZE, total_step+1) if current_memo_size < BATCH_SIZE: batch_idx = range(0, current_memo_size) else: batch_idx = np.random.choice(current_memo_size,BATCH_SIZE) for agent_i in range(NUM_AGENT): agent = agents[agent_i] batch_obses, 
batch_next_obses, batch_states,batch_next_state, batch_actions,batch_rewards, batch_dones = agent.replay_buffer.sample(batch_idx) batch_obses_tensor = torch.tensor(batch_obses,dtype=torch.float).to(device) batch_next_obses_tensor = torch.tensor(batch_next_obses,dtype=torch.float).to(device) batch_states_tensor = torch.tensor(batch_states,dtype=torch.float).to(device) batch_next_state_tensor = torch.tensor(batch_next_state,dtype=torch.float).to(device) batch_actions_tensor = torch.tensor(batch_actions,dtype=torch.float).to(device) batch_rewards_tensor = torch.tensor(batch_rewards,dtype=torch.float).to(device) batch_done_tensor = torch.tensor(batch_dones,dtype=torch.float).to(device) multi_batch_obses.append(batch_obses_tensor) multi_batch_next_obses.append(batch_next_obses_tensor) multi_batch_states.append(batch_states_tensor) multi_batch_next_states.append(batch_next_state_tensor) multi_batch_actions.append(batch_actions_tensor) single_batch_next_actions = agent.target_actor.forward(batch_next_obses_tensor) multi_batch_next_actions.append(single_batch_next_actions) single_batch_online_action = agent.actor.forward(batch_obses_tensor) multi_batch_online_actions.append(single_batch_online_action) multi_batch_rewards.append(batch_rewards_tensor) multi_batch_dones.append(batch_done_tensor) multi_batch_actions_tensor = torch.cat(multi_batch_actions, dim=1).to(device) multi_batch_next_actions_tensor = torch.cat(multi_batch_next_actions, dim=1).to(device) multi_batch_online_actions_tensor = torch.cat(multi_batch_online_actions, dim=1).to(device) if(total_step+1) % TARGET_UPDATE_INTERVAL == 0: for agent_i in range(NUM_AGENT): agent = agents[agent_i] batch_obses_tensor = multi_batch_obses[agent_i] batch_states_tensor = multi_batch_states[agent_i] batch_next_states_tensor = multi_batch_next_states[agent_i] batch_rewards_tensor =multi_batch_rewards [agent_i] batch_dones_tensor =multi_batch_dones [agent_i] batch_actions_tensor =multi_batch_actions [agent_i] #target critic critic_target_q = agent.target_critic.forward(batch_next_state_tensor ,multi_batch_next_actions_tensor.detach()) y = (batch_rewards_tensor + (1-batch_dones_tensor)*agent.gamma*critic_target_q).flatten() critic_q = agent.critic.forward(batch_states_tensor,multi_batch_actions_tensor.detach()).flatten() #update critic critic_loss = nn.MSELoss()(y,critic_q) agent.critic.optimizer.zero_grad() critic_loss.backward() agent.critic.optimizer.step() #update actor actor_q = agent.critic.forward(batch_states_tensor, multi_batch_online_actions_tensor.detach()).flatten() actor_loss = -torch.mean(actor_q) agent.actor.optimizer.zero_grad() actor_loss.backward() agent.actor.optimizer.step() # update target critic for target_param, param in zip(agent.target_critic.parameters(), agent.critic.parameters()): target_param.data.copy_(agent.tau * param.data+(1.0-agent.tau)*target_param.data) # update target actor for target_param, param in zip(agent.target_actor.parameters(), agent.actor.parameters()): target_param.data.copy_(agent.tau * param.data+(1.0-agent.tau)*target_param.data) multi_obs = multi_next_obs episode_reward += sum([single_reward for single_reward in multi_reward.values()]) print(f"episode reward :{episode_reward}") # 3.Render the env if(episode_i +1) % 50 == 0: env= simple_adversary_v3.parallel_env(N=2, max_cycles=NUM_STEP, continuous_actions = True, render_mode = "human") for test_epi_i in range(2): multi_obs, infos = env.reset() for step_i in range(NUM_STEP): multi_actions={} for agent_i, agent_name in enumerate(agent_name_list): agent = 
agents[agent_i] single_obs = multi_obs[agent_name] single_action = agent.get_action(single_obs) multi_actions[agent_name] = single_action multi_next_obs, multi_reward, multi_done, multi_truncations, infos = env.step(multi_actions) multi_obs = multi_next_obs # Save the agents if episode_i == 0: highest_reward = episode_reward if episode_reward >highest_reward: highest_reward=episode_reward print(f"Highest reward update at episode {episode_i}:{round(highest_reward,2)}") for agent_i in range(NUM_AGENT): agent = agents[agent_i] flag = os.path.exists(agent_path) if not flag: os.makedirs(agent_path) torch.save(agent.actor.state_dict(),f"models"+"\\"+"simple_adversary_v3"+"\\"+f"agent_{agent_i}_actor_{scenario}_{timestamp}.pth") env.close()
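After training, the saved actor weights can be reloaded for evaluation. A minimal sketch is below; it reuses the Agent class from maddpg_agent.py, and the checkpoint paths are placeholders for whatever filenames main.py actually produced (the hyperparameters must match the ones used during training):

# Minimal evaluation sketch. Assumptions: the Agent hyperparameters match training,
# and `actor_ckpt_paths` points at checkpoints produced by main.py (the paths below
# are placeholders, not real filenames).
from pettingzoo.mpe import simple_adversary_v3
from maddpg_agent import Agent

env = simple_adversary_v3.parallel_env(N=2, max_cycles=100,
                                       continuous_actions=True, render_mode="human")
multi_obs, infos = env.reset()
obs_dims = [o.shape[0] for o in multi_obs.values()]
state_dim = sum(obs_dims)
action_dims = [env.action_space(name).shape[0] for name in env.agents]

actor_ckpt_paths = ["models/simple_adversary_v3/agent_0_actor_....pth",  # placeholder
                    "models/simple_adversary_v3/agent_1_actor_....pth",  # placeholder
                    "models/simple_adversary_v3/agent_2_actor_....pth"]  # placeholder

agents = []
for i, name in enumerate(env.agents):
    agent = Agent(memo_size=1, obs_dim=obs_dims[i], state_dim=state_dim, n_agent=env.num_agents,
                  action_dim=action_dims[i], alpha=1e-3, beta=1e-3,
                  fc1_dims=64, fc2_dims=64, gamma=0.99, tau=0.01, batch_size=1)
    agent.actor.load_checkpoint(actor_ckpt_paths[i])
    agents.append(agent)

# Roll out one episode with the loaded policies
for _ in range(100):
    actions = {name: agents[i].get_action(multi_obs[name]) for i, name in enumerate(env.agents)}
    multi_obs, rewards, dones, truncs, infos = env.step(actions)
env.close()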
maddpg_agent.py
""" #!/usr/bin/env python # -*- coding:utf-8 -*- @Project : MADDPG """ import numpy as np import torch import torch.nn as nn import torch.nn.functional as F device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") class ReplayBuffer: def __init__(self, capcity, obs_dim, state_dim, action_dim, batch_size): self.capcity = capcity self.obs_cap = np.empty((self.capcity,obs_dim)) self.next_obs_cap = np.empty((self.capcity,obs_dim)) self.state_cap = np.empty((self.capcity,state_dim)) self.next_state_cap = np.empty((self.capcity,state_dim)) self.action_cap = np.empty((self.capcity,action_dim)) self.reward_cap = np.empty((self.capcity,1)) self.done_cap = np.empty((self.capcity,1)) self.batch_batch = batch_size self.current = 0 def add_memo(self, obs, next_obs, state, next_state, action, reward, done): self.obs_cap[self.current] =obs self.next_obs_cap[self.current] =next_obs self.state_cap[self.current] =state self.next_state_cap[self.current] =next_state self.action_cap[self.current] =action self.reward_cap[self.current] =reward self.done_cap[self.current] =done self.current = (self.current + 1) % self.capcity #get one sample def sample(self,idxes): obs = self.obs_cap[idxes] next_obs = self.next_obs_cap[idxes] state = self.state_cap[idxes] next_state = self.next_state_cap[idxes] action = self.action_cap[idxes] reward = self.reward_cap[idxes] done = self.done_cap[idxes] return obs,next_obs,state,next_state,action,reward,done class Critic(nn.Module): def __init__(self, lr_critic, input_dims, fc1_dims, fc2_dims,n_agent,action_dim): super(Critic, self).__init__() self.fc1 = nn.Linear(input_dims+n_agent*action_dim,fc1_dims) self.fc2 = nn.Linear(fc1_dims,fc2_dims) self.q = nn.Linear(fc2_dims, 1) self.optimizer = torch.optim.Adam(self.parameters(),lr=lr_critic) def forward(self, state,action): x= torch.cat([state,action],dim=1) x = F.relu(self.fc1(x)) x = F.relu(self.fc2(x)) q = self.q(x) return q def save_checkpoint(self, checkpoint_file): torch.save(self.state_dict(), checkpoint_file) def load_checkpoint(self, checkpoint_file): self.load_state_dict(torch.load(checkpoint_file)) class Actor(nn.Module): def __init__(self, lr_actor, input_dims, fc1_dims, fc2_dims,action_dim): super(Actor, self).__init__() self.fc1 = nn.Linear(input_dims,fc1_dims) self.fc2 = nn.Linear(fc1_dims,fc2_dims) self.pi = nn.Linear(fc2_dims, action_dim) self.optimizer = torch.optim.Adam(self.parameters(),lr=lr_actor) def forward(self, state): x = F.relu((self.fc1(state))) x = F.relu((self.fc2(x))) mu = torch.softmax(self.pi(x), dim=1) return mu def save_checkpoint(self, checkpoint_file): torch.save(self.state_dict(), checkpoint_file) def load_checkpoint(self, checkpoint_file): self.load_state_dict(torch.load(checkpoint_file)) class Agent: def __init__(self, memo_size, obs_dim, state_dim, n_agent, action_dim, alpha ,beta, fc1_dims, fc2_dims, gamma, tau , batch_size): self.gamma = gamma self.tau = tau self.action_dim = action_dim self.actor = Actor(lr_actor=alpha, input_dims=obs_dim, fc1_dims=fc1_dims, fc2_dims=fc2_dims, action_dim=action_dim).to(device) self.critic = Critic(lr_critic=beta, input_dims=state_dim, fc1_dims=fc1_dims, fc2_dims=fc2_dims, n_agent=n_agent,action_dim=action_dim).to(device) self.target_actor = Actor(lr_actor=alpha, input_dims=obs_dim, fc1_dims=fc1_dims, fc2_dims=fc2_dims, action_dim=action_dim).to(device) self.target_critic = Critic(lr_critic=beta, input_dims=state_dim, fc1_dims=fc1_dims, fc2_dims=fc2_dims, n_agent=n_agent, action_dim=action_dim).to(device) self.replay_buffer = 
ReplayBuffer(capcity=memo_size, obs_dim=obs_dim, state_dim=state_dim, action_dim=action_dim, batch_size=batch_size) def get_action(self, obs): single_obs = torch.tensor(data=obs, dtype=torch.float).unsqueeze(0).to(device) single_action = self.actor.forward(single_obs) noise = torch.randn(self.action_dim).to(device)*0.2 single_action = torch.clamp(input=single_action+noise, min=0.0, max=1.0) return single_action.detach().cpu().numpy()[0] def save_model(self,filename): self.actor.save_checkpoint(filename) self.target_actor.save_checkpoint(filename) self.critic.save_checkpoint(filename) self.target_critic.save_checkpoint(filename) def load_model(self,filename): self.actor.load_checkpoint(filename) self.target_actor.load_checkpoint(filename) self.critic.load_checkpoint(filename) self.target_critic.load_checkpoint(filename)
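As a quick standalone sanity check of the Agent class (the dimensions below are arbitrary and for illustration only; state_dim=28 happens to match simple_adversary_v3 with N=2), you can build one agent with dummy sizes and confirm that get_action returns a vector of the expected shape:

# Standalone sanity check for maddpg_agent.Agent (dummy dimensions, illustration only).
import numpy as np
from maddpg_agent import Agent

agent = Agent(memo_size=100, obs_dim=10, state_dim=28, n_agent=3, action_dim=5,
              alpha=1e-3, beta=1e-3, fc1_dims=64, fc2_dims=64,
              gamma=0.99, tau=0.01, batch_size=8)

obs = np.random.rand(10).astype(np.float32)
action = agent.get_action(obs)
print(action.shape)   # expected: (5,), each entry clipped to [0, 1]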