Python OpenAI Gym Intermediate Tutorial: Multi-Agent Systems
In reinforcement learning, a multi-agent system involves several agents interacting with one another. In this post, we will look at how to build and train a multi-agent system in OpenAI Gym, training the agents cooperatively in the style of Multi-Agent Deep Deterministic Policy Gradients (MADDPG).
1. Installing Dependencies
First, make sure you have installed OpenAI Gym and the other required dependencies:
pip install gym
pip install numpy
pip install tensorflow
pip install matplotlib
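As a quick, optional check that everything installed correctly, you can import the packages and print their versions (a small convenience snippet, not part of the tutorial itself):

import gym
import numpy
import tensorflow
import matplotlib

# If any import fails, the corresponding package is missing or broken
print(gym.__version__, numpy.__version__, tensorflow.__version__, matplotlib.__version__)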
2. A Multi-Agent Environment
As an example, we will use a simple multi-agent environment called MultiAgentEnv. It contains two agents, each controlling one small car; the goal is for the two cars to move cooperatively on a two-dimensional plane while avoiding collisions with each other. At every step, each agent picks one of five discrete actions (stay, up, down, left, or right), which moves its car by a small amount on the plane.
import gym
from gym import spaces
import numpy as np


class MultiAgentEnv(gym.Env):
    def __init__(self):
        super(MultiAgentEnv, self).__init__()
        # Each agent picks one of 5 discrete actions: stay, up, down, left, right
        self.action_space = spaces.Discrete(5)
        # The joint observation is the concatenation of both agents' 4-dimensional states
        self.observation_space = spaces.Box(low=0, high=1, shape=(8,), dtype=np.float32)
        # Initialize the states of the two cars (position x, y plus two unused slots)
        self.agent1_state = np.array([0.2, 0.5, 0.0, 0.0])
        self.agent2_state = np.array([0.8, 0.5, 0.0, 0.0])

    def reset(self):
        # Reset the environment, placing the two cars at their initial positions
        self.agent1_state = np.array([0.2, 0.5, 0.0, 0.0])
        self.agent2_state = np.array([0.8, 0.5, 0.0, 0.0])
        return np.concatenate([self.agent1_state, self.agent2_state])

    def step(self, actions):
        # Apply one discrete action per agent and update both cars' positions
        moves = {
            0: np.array([0.0, 0.0]),   # stay
            1: np.array([0.0, 0.1]),   # up
            2: np.array([0.0, -0.1]),  # down
            3: np.array([-0.1, 0.0]),  # left
            4: np.array([0.1, 0.0]),   # right
        }
        self.agent1_state[:2] += moves[int(actions[0])]
        self.agent2_state[:2] += moves[int(actions[1])]
        # Keep positions within the [0, 1] range
        self.agent1_state[:2] = np.clip(self.agent1_state[:2], 0, 1)
        self.agent2_state[:2] = np.clip(self.agent2_state[:2], 0, 1)
        # Compute rewards: the negative distance between the two cars
        reward1 = -float(np.linalg.norm(self.agent1_state[:2] - self.agent2_state[:2]))
        reward2 = -float(np.linalg.norm(self.agent2_state[:2] - self.agent1_state[:2]))
        # Return observation, rewards, done flag and extra info
        return np.concatenate([self.agent1_state, self.agent2_state]), [reward1, reward2], False, {}
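Before training anything, it is worth sanity-checking the environment with random actions. The short snippet below is a minimal sketch that assumes only the MultiAgentEnv class defined above; it steps the environment a few times and prints the rewards.

# Minimal sanity check for MultiAgentEnv (assumes the class defined above)
env = MultiAgentEnv()
obs = env.reset()
print("initial observation:", obs)

for _ in range(5):
    # Sample one random discrete action per agent
    actions = [env.action_space.sample(), env.action_space.sample()]
    obs, rewards, done, info = env.step(actions)
    print("actions:", actions, "rewards:", [round(r, 3) for r in rewards])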
3. The MADDPG Algorithm
Next, we implement the MADDPG algorithm. To keep things simple, we only implement the two-agent case, and each agent gets a lightweight actor-critic network rather than the full MADDPG machinery (replay buffers, target networks, centralized critics).
import tensorflow as tf
from tensorflow.keras import layers


class ActorCritic(tf.keras.Model):
    def __init__(self, num_actions):
        super(ActorCritic, self).__init__()
        # Actor network: maps the observation to a probability over discrete actions
        self.actor_fc1 = layers.Dense(64, activation='relu')
        self.actor_fc2 = layers.Dense(64, activation='relu')
        self.actor_output = layers.Dense(num_actions, activation='softmax')
        # Critic network: maps the observation to a scalar state value
        self.critic_fc1 = layers.Dense(64, activation='relu')
        self.critic_fc2 = layers.Dense(64, activation='relu')
        self.critic_output = layers.Dense(1, activation='linear')

    def call(self, state):
        # Actor head: action probabilities
        actor_x = self.actor_fc1(state)
        actor_x = self.actor_fc2(actor_x)
        action_probs = self.actor_output(actor_x)
        # Critic head: state value
        critic_x = self.critic_fc1(state)
        critic_x = self.critic_fc2(critic_x)
        state_value = self.critic_output(critic_x)
        return action_probs, state_value
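As a quick check, we can run a forward pass on a dummy joint observation and inspect the output shapes. This is just a sketch assuming the ActorCritic class above and the 8-dimensional joint observation of our environment; the model and dummy_state names are illustrative.

# Forward pass on a dummy joint observation to verify output shapes
model = ActorCritic(num_actions=5)
dummy_state = tf.zeros((1, 8), dtype=tf.float32)
action_probs, state_value = model(dummy_state)
print(action_probs.shape)  # (1, 5): one probability per discrete action
print(state_value.shape)   # (1, 1): scalar state value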
4. Training the Multi-Agent System
Now, we use this setup to train the multi-agent system.
def train_maddpg(env, model1, model2, optimizer1, optimizer2, num_episodes=1000, gamma=0.99):
    for episode in range(num_episodes):
        state = env.reset()
        state = tf.convert_to_tensor(state, dtype=tf.float32)
        total_reward1 = 0
        total_reward2 = 0

        for t in range(1000):  # run at most 1000 time steps per episode
            with tf.GradientTape() as tape1, tf.GradientTape() as tape2:
                # Each agent samples an action from its own policy
                action_probs1, state_value1 = model1(state[None, :])
                action1 = tf.random.categorical(tf.math.log(action_probs1), 1)[0, 0]
                action_probs2, state_value2 = model2(state[None, :])
                action2 = tf.random.categorical(tf.math.log(action_probs2), 1)[0, 0]

                # Step the environment with both agents' actions
                next_state, rewards, done, _ = env.step([action1.numpy(), action2.numpy()])
                next_state = tf.convert_to_tensor(next_state, dtype=tf.float32)
                total_reward1 += rewards[0]
                total_reward2 += rewards[1]

                # Bootstrap each agent's target from its critic's value of the next state
                _, next_value1 = model1(next_state[None, :])
                _, next_value2 = model2(next_state[None, :])
                target1 = rewards[0] + gamma * tf.stop_gradient(next_value1)
                target2 = rewards[1] + gamma * tf.stop_gradient(next_value2)

                # Advantage = TD error of the critic
                advantage1 = target1 - state_value1
                advantage2 = target2 - state_value2

                # Actor loss (policy gradient) and critic loss (squared TD error)
                loss_actor1 = -tf.math.log(action_probs1[0, action1]) * tf.stop_gradient(advantage1)
                loss_actor2 = -tf.math.log(action_probs2[0, action2]) * tf.stop_gradient(advantage2)
                loss_critic1 = tf.square(target1 - state_value1)
                loss_critic2 = tf.square(target2 - state_value2)

                # Total loss per agent
                total_loss1 = loss_actor1 + loss_critic1
                total_loss2 = loss_actor2 + loss_critic2

            # Update each agent's parameters
            gradients1 = tape1.gradient(total_loss1, model1.trainable_variables)
            optimizer1.apply_gradients(zip(gradients1, model1.trainable_variables))
            gradients2 = tape2.gradient(total_loss2, model2.trainable_variables)
            optimizer2.apply_gradients(zip(gradients2, model2.trainable_variables))

            # Move on to the next state
            state = next_state
            if done:
                break

        if episode % 10 == 0:
            print(f"Episode: {episode}, Total Reward Agent 1: {total_reward1}, Total Reward Agent 2: {total_reward2}")
5. Main Function
Finally, we define a main function to run our multi-agent system.
if __name__ == "__main__":
    # Create the multi-agent environment and one model per agent
    env = MultiAgentEnv()
    model1 = ActorCritic(num_actions=5)
    model2 = ActorCritic(num_actions=5)

    # Create one optimizer per agent
    optimizer1 = tf.optimizers.Adam(learning_rate=0.001)
    optimizer2 = tf.optimizers.Adam(learning_rate=0.001)

    # Train the multi-agent system
    train_maddpg(env, model1, model2, optimizer1, optimizer2, num_episodes=500)
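Since matplotlib is already among the dependencies, you can also visualize the two agents after training. The snippet below is a rough sketch, assuming the env, model1 and model2 objects from the main block above (for example, appended to the end of that block); it rolls out one episode with greedy (argmax) actions and plots both cars' trajectories.

import matplotlib.pyplot as plt

# Roll out one greedy episode and record both agents' positions
state = tf.convert_to_tensor(env.reset(), dtype=tf.float32)
traj1, traj2 = [], []
for _ in range(100):
    probs1, _ = model1(state[None, :])
    probs2, _ = model2(state[None, :])
    a1 = int(tf.argmax(probs1[0]))
    a2 = int(tf.argmax(probs2[0]))
    obs, rewards, done, _ = env.step([a1, a2])
    traj1.append(obs[:2])   # agent 1 position (x, y)
    traj2.append(obs[4:6])  # agent 2 position (x, y)
    state = tf.convert_to_tensor(obs, dtype=tf.float32)

traj1, traj2 = np.array(traj1), np.array(traj2)
plt.plot(traj1[:, 0], traj1[:, 1], label="agent 1")
plt.plot(traj2[:, 0], traj2[:, 1], label="agent 2")
plt.xlim(0, 1)
plt.ylim(0, 1)
plt.legend()
plt.show()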
With this example, we have shown how to build a simple multi-agent environment in OpenAI Gym and how to train a multi-agent system cooperatively with a MADDPG-style algorithm. It can serve as a starting point for getting into multi-agent reinforcement learning, and it also illustrates the basic use of TensorFlow and OpenAI Gym in a multi-agent setting. I hope this post helps you understand and apply multi-agent systems.