Gymnasium Reinforcement Learning Environment Library Tutorial
1. Project Introduction
Gymnasium is the official successor to OpenAI Gym, maintained and developed by the Farama Foundation. It aims to provide a standardized environment interface for reinforcement learning research and development. It keeps the core functionality of OpenAI Gym while adding many new features and improvements, including better documentation, a more consistent API, and more environments.
- GitHub: https://github.com/Farama-Foundation/Gymnasium
- Stars: 6.5k+
- Key features:
  - A wide range of reinforcement learning environments
  - An interface compatible with OpenAI Gym
  - Support for many environment families: Atari, board games, continuous control, and more
  - A standardized environment interface
  - Detailed documentation and examples
  - Support for custom environments
  - Better type hints and documentation
  - A more consistent API design
2. Installation Guide
2.1 System Requirements
- Python 3.8+
- Supported operating systems: Linux, macOS, Windows
2.2 Installation Steps
- Install Gymnasium:
pip install gymnasium
- Install dependencies for specific environment families:
# Install dependencies for all environments
pip install "gymnasium[all]"
# Or install dependencies for a specific family
pip install "gymnasium[atari]"           # Atari environments
pip install "gymnasium[box2d]"           # Box2D environments
pip install "gymnasium[classic-control]" # classic control environments
pip install "gymnasium[mujoco]"          # MuJoCo environments
- Verify the installation:
python -c "import gymnasium as gym; print('Gymnasium installed successfully')"
3. Core Concepts
3.1 Environment
The environment is the external world the agent interacts with. Gymnasium provides many environments, each with its own rules, dynamics, and state space.
3.2 Agent
The agent is the entity that acts in the environment, learning an optimal policy through repeated interaction.
3.3 Observation Space
The observation space defines the type and range of information the agent can receive from the environment.
3.4 Action Space
The action space defines the type and range of actions the agent can take.
3.5 Reward
The reward is the feedback signal the environment gives the agent; the agent's goal is to maximize cumulative reward.
3.6 Episode
An episode is one complete interaction sequence, from an environment reset to termination.
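To make the space concepts concrete, here is a minimal sketch (the names obs_space and act_space are illustrative) that constructs the two most common space types from gymnasium.spaces and queries them:
import numpy as np
from gymnasium import spaces
# A Box space: 4-dimensional continuous values in [-1, 1],
# the kind typically used for continuous observations
obs_space = spaces.Box(low=-1.0, high=1.0, shape=(4,), dtype=np.float32)
# A Discrete space: two valid actions, 0 and 1
act_space = spaces.Discrete(2)
print(obs_space.sample())     # a random valid observation
print(act_space.sample())     # a random valid action (0 or 1)
print(act_space.contains(1))  # True: 1 lies inside the action space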
4. Basic Usage
4.1 Basic Environment Interaction
import gymnasium as gym
# Create the environment
env = gym.make("CartPole-v1")
# Reset the environment
observation, info = env.reset(seed=42)
# Interact with the environment
for _ in range(1000):
    action = env.action_space.sample()  # sample a random action
    observation, reward, terminated, truncated, info = env.step(action)
    if terminated or truncated:
        observation, info = env.reset()
env.close()
4.2 Environment Information
import gymnasium as gym
# Create the environment
env = gym.make("CartPole-v1")
# Reset the environment
env.reset()
# Query environment information
print(f"Environment ID: {env.unwrapped.spec.id}")
print(f"Observation space: {env.observation_space}")
print(f"Action space: {env.action_space}")
print(f"Max episode steps: {env.unwrapped.spec.max_episode_steps}")
4.3 Environment Rendering
import gymnasium as gym
# Create the environment with rendering enabled
env = gym.make("CartPole-v1", render_mode="human")
# Reset the environment
observation, info = env.reset(seed=42)
# Interact with the environment
for _ in range(1000):
    action = env.action_space.sample()
    observation, reward, terminated, truncated, info = env.step(action)
    if terminated or truncated:
        observation, info = env.reset()
env.close()
4.4 Saving and Loading Environment State
import gymnasium as gym
import pickle
# Create the environment
env = gym.make("CartPole-v1")
# Reset the environment
observation, info = env.reset(seed=42)
# Save the environment state
# Note: this relies on CartPole exposing its internal `state` attribute;
# it is not a general Gymnasium API and will not work for every environment
state = env.unwrapped.state
with open("env_state.pkl", "wb") as f:
    pickle.dump(state, f)
# Load the environment state
with open("env_state.pkl", "rb") as f:
    loaded_state = pickle.load(f)
env.unwrapped.state = loaded_state
5. Advanced Features
5.1 Custom Environments
import gymnasium as gym
from gymnasium import spaces
import numpy as np
class CustomEnv(gym.Env):
    def __init__(self):
        super().__init__()
        # Define the observation space
        self.observation_space = spaces.Box(low=0, high=10, shape=(1,), dtype=np.float32)
        # Define the action space
        self.action_space = spaces.Discrete(2)
        # Initialize the state
        self.state = 5.0
    def reset(self, seed=None, options=None):
        # Seed the built-in RNG via the base class, then reset the state
        super().reset(seed=seed)
        self.state = 5.0
        # Return the initial observation and the info dict
        return np.array([self.state], dtype=np.float32), {}
    def step(self, action):
        # Apply the action
        if action == 0:
            self.state -= 1
        else:
            self.state += 1
        # Compute the reward
        if self.state == 10:
            reward = 10
            terminated = True
        elif self.state == 0:
            reward = -10
            terminated = True
        else:
            reward = -1
            terminated = False
        # Truncation (not used here)
        truncated = False
        # Extra diagnostic information
        info = {}
        return np.array([self.state], dtype=np.float32), reward, terminated, truncated, info
# Use the custom environment
env = CustomEnv()
observation, info = env.reset()
for _ in range(100):
    action = env.action_space.sample()
    observation, reward, terminated, truncated, info = env.step(action)
    print(f"Observation: {observation}, Reward: {reward}, Terminated: {terminated}")
    if terminated or truncated:
        observation, info = env.reset()
env.close()
5.2 Environment Wrappers
import gymnasium as gym
from gymnasium.wrappers import RescaleAction, TimeLimit
# Create the environment
env = gym.make("Pendulum-v1")
# Apply wrappers
env = RescaleAction(env, min_action=-1.0, max_action=1.0)  # rescale the action space
env = TimeLimit(env, max_episode_steps=100)  # cap the episode length
# Interact with the environment
observation, info = env.reset()
for _ in range(100):
action = env.action_space.sample()
observation, reward, terminated, truncated, info = env.step(action)
if terminated or truncated:
observation, info = env.reset()
env.close()
5.3 Vectorized Environments
import gymnasium as gym
from gymnasium.vector import SyncVectorEnv
# Create four copies of the environment
envs = SyncVectorEnv([
    lambda: gym.make("CartPole-v1") for _ in range(4)
])
# Reset the environments
observations, infos = envs.reset(seed=42)
# Interact with the environments
for _ in range(100):
    actions = envs.action_space.sample()  # sample one action per sub-environment
    observations, rewards, terminateds, truncateds, infos = envs.step(actions)
    # Vectorized environments reset finished sub-environments automatically,
    # so no manual reset is needed inside the loop
envs.close()
5.4 Environment Registration
import gymnasium as gym
from gymnasium import spaces
import numpy as np
class CustomEnv(gym.Env):
def __init__(self):
super().__init__()
self.observation_space = spaces.Box(low=0, high=10, shape=(1,), dtype=np.float32)
self.action_space = spaces.Discrete(2)
self.state = 5.0
    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        self.state = 5.0
        return np.array([self.state], dtype=np.float32), {}
def step(self, action):
if action == 0:
self.state -= 1
else:
self.state += 1
if self.state == 10:
reward = 10
terminated = True
elif self.state == 0:
reward = -10
terminated = True
else:
reward = -1
terminated = False
truncated = False
info = {}
        return np.array([self.state], dtype=np.float32), reward, terminated, truncated, info
# Register the environment
gym.register(
id="CustomEnv-v0",
entry_point=CustomEnv,
max_episode_steps=100,
)
# Use the registered environment
env = gym.make("CustomEnv-v0")
observation, info = env.reset()
for _ in range(100):
action = env.action_space.sample()
observation, reward, terminated, truncated, info = env.step(action)
if terminated or truncated:
observation, info = env.reset()
env.close()
6. Practical Examples
6.1 Training a Reinforcement Learning Model on a Gymnasium Environment
Scenario: train a tabular Q-learning agent on Gymnasium's CartPole environment
Implementation:
import gymnasium as gym
import numpy as np
# Create the environment
env = gym.make("CartPole-v1")
# Training parameters
num_episodes = 1000
max_steps = 200
learning_rate = 0.1
discount_factor = 0.99
# Discretization bins: one array of bin edges per observation dimension
bins = [
    np.linspace(-4.8, 4.8, 10),      # cart position
    np.linspace(-4.0, 4.0, 10),      # cart velocity
    np.linspace(-0.418, 0.418, 10),  # pole angle
    np.linspace(-4.0, 4.0, 10),      # pole angular velocity
]
# Q-table: np.digitize with 10 bin edges yields indices 0..10,
# so each state dimension needs 11 slots, plus one action dimension
q_table = np.zeros([len(b) + 1 for b in bins] + [env.action_space.n])
# Training loop
for episode in range(num_episodes):
observation, info = env.reset(seed=episode)
episode_reward = 0
for step in range(max_steps):
        # Discretize the observation into table indices
        state_idx = tuple(np.digitize(o, b) for o, b in zip(observation, bins))
        # ε-greedy policy
        if np.random.uniform(0, 1) < 0.1:
            action = env.action_space.sample()
        else:
            action = np.argmax(q_table[state_idx])
        # Execute the action
        next_observation, reward, terminated, truncated, info = env.step(action)
        next_state_idx = tuple(np.digitize(o, b) for o, b in zip(next_observation, bins))
        # Q-learning update
        best_next_action = np.argmax(q_table[next_state_idx])
        q_table[state_idx + (action,)] += learning_rate * (
            reward + discount_factor * q_table[next_state_idx + (best_next_action,)] - q_table[state_idx + (action,)]
)
observation = next_observation
episode_reward += reward
if terminated or truncated:
break
if episode % 100 == 0:
print(f"Episode {episode}: Reward = {episode_reward}")
env.close()
6.2 Training with Gymnasium and Stable Baselines3
Scenario: train a reinforcement learning model with a Gymnasium environment and Stable Baselines3's PPO algorithm
Implementation:
import gymnasium as gym
from stable_baselines3 import PPO
# Create the environment
env = gym.make("CartPole-v1")
# Train a PPO model
model = PPO("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=100000)
# Test the trained model
env = gym.make("CartPole-v1", render_mode="human")
observation, info = env.reset()
total_reward = 0
for _ in range(1000):
action, _states = model.predict(observation)
observation, reward, terminated, truncated, info = env.step(action)
total_reward += reward
if terminated or truncated:
print(f"Total reward: {total_reward}")
total_reward = 0
observation, info = env.reset()
env.close()
6.3 Creating a Custom Environment with Gymnasium
Scenario: build a simple maze environment
Implementation:
import gymnasium as gym
from gymnasium import spaces
import numpy as np
class MazeEnv(gym.Env):
    def __init__(self):
        super().__init__()
        # Maze layout (0: free cell, 1: wall, 2: goal)
        self.maze = np.array([
            [0, 1, 0, 0, 2],
            [0, 1, 0, 1, 0],
            [0, 0, 0, 1, 0],
            [1, 1, 0, 1, 0],
            [0, 0, 0, 0, 0]
        ])
        # Observation space: the agent's (row, col) position
        self.observation_space = spaces.Box(low=0, high=4, shape=(2,), dtype=np.int32)
        # Action space: up, down, left, right
        self.action_space = spaces.Discrete(4)
        # Action-to-offset mapping: up, down, left, right
        self.actions = [(-1, 0), (1, 0), (0, -1), (0, 1)]
    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        # Start from the top-left corner (0, 0)
        self.position = (0, 0)
        return np.array(self.position, dtype=np.int32), {}
    def step(self, action):
        # Compute the candidate position
        new_position = (self.position[0] + self.actions[action][0],
                        self.position[1] + self.actions[action][1])
        # Check the maze bounds
        if (new_position[0] < 0 or new_position[0] >= 5 or
                new_position[1] < 0 or new_position[1] >= 5):
            # Out of bounds: stay put, small penalty
            reward = -1
            terminated = False
            new_position = self.position
        else:
            if self.maze[new_position[0], new_position[1]] == 1:
                # Hit a wall: stay put, small penalty
                reward = -1
                terminated = False
                new_position = self.position
            elif self.maze[new_position[0], new_position[1]] == 2:
                # Reached the goal
                reward = 10
                terminated = True
            else:
                # Free cell: small step cost to encourage short paths
                reward = -0.1
                terminated = False
        self.position = new_position
        truncated = False
        info = {}
        return np.array(self.position, dtype=np.int32), reward, terminated, truncated, info
# Use the custom environment
env = MazeEnv()
observation, info = env.reset()
for _ in range(100):
    action = env.action_space.sample()
    observation, reward, terminated, truncated, info = env.step(action)
    print(f"Position: {observation}, Reward: {reward}, Terminated: {terminated}")
    if terminated or truncated:
        observation, info = env.reset()
env.close()
7. Performance Optimization
7.1 Environment Optimization
- Use vectorized environments (VectorEnv) to run several environments in parallel, as in the sketch after this list
- Disable rendering for training runs that do not need it
- Choose appropriate wrappers, such as TimeLimit and RescaleAction
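As a minimal sketch of the first two points, the snippet below steps eight CartPole copies in parallel subprocesses with AsyncVectorEnv; leaving render_mode unset disables rendering, which is what you want during training (the environment count and step count are illustrative):
import gymnasium as gym
from gymnasium.vector import AsyncVectorEnv
# Eight sub-environments, each stepped in its own subprocess;
# no render_mode is passed, so nothing is drawn during training
envs = AsyncVectorEnv([lambda: gym.make("CartPole-v1") for _ in range(8)])
observations, infos = envs.reset(seed=0)
for _ in range(1000):
    actions = envs.action_space.sample()  # one random action per sub-environment
    observations, rewards, terminateds, truncateds, infos = envs.step(actions)
envs.close()
On platforms that spawn subprocesses (such as Windows), wrap this code in an if __name__ == "__main__": guard.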
7.2 Algorithm Optimization
- Choose an algorithm suited to the action space (e.g., DQN for discrete action spaces, SAC or TD3 for continuous ones, PPO and A2C for both); a hyperparameter sketch follows this list
- Tune hyperparameters such as the learning rate, batch size, and gamma
- Use a network architecture that matches the input, e.g., a CNN for image observations
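A minimal sketch of passing such hyperparameters, assuming Stable Baselines3 is installed (the values shown are common starting points, not tuned results):
import gymnasium as gym
from stable_baselines3 import PPO
env = gym.make("CartPole-v1")
# Illustrative hyperparameters; tune them for your environment
model = PPO(
    "MlpPolicy",
    env,
    learning_rate=3e-4,  # step size for gradient updates
    batch_size=64,       # minibatch size per gradient step
    gamma=0.99,          # discount factor
    verbose=0,
)
model.learn(total_timesteps=10_000)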
7.3 Compute Optimization
- Use a GPU to accelerate training; see the sketch after this list
- Tune batch_size and n_steps to keep the GPU fully utilized
- Use multiple processes to train in parallel
- For large models, consider gradient clipping and learning-rate scheduling
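A sketch of GPU placement with Stable Baselines3 (assuming a CUDA-capable GPU is available; with the default device="auto", SB3 picks the GPU automatically when it finds one):
import gymnasium as gym
from stable_baselines3 import PPO
env = gym.make("CartPole-v1")
# device="cuda" puts the policy and value networks on the GPU;
# the n_steps and batch_size values here are illustrative
model = PPO("MlpPolicy", env, n_steps=2048, batch_size=256, device="cuda", verbose=0)
model.learn(total_timesteps=10_000)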
8. Common Problems and Solutions
8.1 Environment Reset Problems
Problem: the state is incorrect after the environment is reset
Solutions:
- Make sure the reset method re-initializes all state variables and calls super().reset(seed=seed) first, as in the sketch below
- Use the seed parameter to make runs reproducible
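A minimal sketch of a correctly seeded reset (the class name SeededEnv is illustrative): super().reset(seed=seed) seeds the environment's built-in self.np_random generator, which should then supply any randomness in the initial state:
import gymnasium as gym
import numpy as np
from gymnasium import spaces
class SeededEnv(gym.Env):
    def __init__(self):
        super().__init__()
        self.observation_space = spaces.Box(low=0, high=1, shape=(1,), dtype=np.float32)
        self.action_space = spaces.Discrete(2)
    def reset(self, seed=None, options=None):
        # Seed self.np_random first, then draw the initial state from it
        super().reset(seed=seed)
        self.state = self.np_random.uniform(0, 1)
        return np.array([self.state], dtype=np.float32), {}
    def step(self, action):
        # Trivial dynamics, included only to make the sketch complete
        return np.array([self.state], dtype=np.float32), 0.0, True, False, {}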
8.2 Action Space Mismatch
Problem: the agent's actions fall outside the action space
Solutions:
- Check actions against the action-space bounds before stepping, as in the sketch below
- Use the RescaleAction or ClipAction wrapper to rescale or clip actions
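A minimal sketch of both checks on Pendulum-v1 (the 10x scaling only serves to force an out-of-range action):
import gymnasium as gym
from gymnasium.wrappers import ClipAction
env = gym.make("Pendulum-v1")
action = env.action_space.high * 10  # deliberately out of range
print(env.action_space.contains(action))  # False: outside [-2, 2]
# ClipAction clips any incoming action to the base environment's bounds,
# so stepping with the oversized action is now safe
env = ClipAction(env)
observation, info = env.reset(seed=0)
observation, reward, terminated, truncated, info = env.step(action)
env.close()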
8.3 Slow Training
Problem: training is slow and iterations take a long time
Solutions:
- Use vectorized environments to train in parallel
- Use a GPU for acceleration
- Tune the batch size and learning rate
- For large environments, consider more efficient preprocessing
8.4 Out of Memory
Problem: training runs out of memory
Solutions:
- Reduce the number of parallel environments
- Reduce the batch size
- Use a smaller network architecture
9. Summary
As the official successor to OpenAI Gym, Gymnasium provides a standardized environment interface for reinforcement learning research and development. It keeps OpenAI Gym's core functionality while adding many new features and improvements that make working with reinforcement learning environments more convenient and efficient.
After working through this tutorial, you should be able to:
- Understand Gymnasium's core concepts and features
- Install and configure Gymnasium
- Use different reinforcement learning environments
- Build custom reinforcement learning environments
- Integrate Gymnasium with other reinforcement learning libraries
- Optimize training performance
- Troubleshoot common problems
Gymnasium's design goal is a unified reinforcement learning environment interface, so that different algorithms can be compared and evaluated on the same environments. It provides essential tooling for reinforcement learning research and applications, and it is a staple of modern reinforcement learning development.