Skip to main content
The environments module provides standardized interfaces for agents to interact with tasks, including single-turn and multi-turn scenarios.

BaseEnv

Abstract base class for all environments following the Gym interface.
from rllm.environments import BaseEnv

Properties

idx
Any
The index or identifier of the environment, often used within a batch.

Methods

reset

Resets the environment to an initial state.
def reset() -> tuple[dict, dict]
observation
dict
The initial observation.
info
dict
Auxiliary information.

step

Executes one time step within the environment.
def step(action: Any) -> tuple[Any, float, bool, dict]
action
Any
An action provided by the agent.
observation
Any
The next observation.
reward
float
The reward for this step.
done
bool
Whether the episode has ended.
info
dict
Additional information.

close

Performs any necessary cleanup.
def close() -> None

from_dict

Static factory method to create an environment from a dictionary.
@staticmethod
def from_dict(info: dict) -> BaseEnv
info
dict
Dictionary containing environment initialization data.

is_multithread_safe

Check if the environment can be used safely across multiple threads.
@staticmethod
def is_multithread_safe() -> bool
safe
bool
default:"True"
Whether the environment is thread-safe.

MultiTurnEnvironment

Environment for multi-turn interactions with language models.
from rllm.environments import MultiTurnEnvironment

class MyEnv(MultiTurnEnvironment):
    def get_reward_and_next_obs(self, task, action):
        # Compute reward logic here
        return reward, next_observation

Constructor

def __init__(
    task: dict | None = None,
    max_turns: int = 3,
    **kwargs
)
task
dict | None
Dictionary containing task information. Should include relevant fields for your specific task.
max_turns
int
default:"3"
Maximum number of turns before terminating the interaction.

Methods

reset

Reset the environment with a new task.
obs, info = env.reset(task=new_task)
task
dict | None
Optional task to set. If None, uses the current task.

step

Take a step in the environment.
next_obs, reward, done, info = env.step(action)
action
Any
Response string from the LLM or action object.

get_reward_and_next_obs

Abstract method to compute reward and next observation. Must be implemented by subclasses.
def get_reward_and_next_obs(
    task: dict,
    action: Any
) -> tuple[float, dict]
task
dict
The task dictionary containing relevant information.
action
Any
The action taken by the agent.
reward
float
The computed reward.
next_obs
dict
The next observation dictionary.

SingleTurnEnvironment

Simplified environment for single-turn interactions. This is a special case of MultiTurnEnvironment where max_turns=1.
from rllm.environments import SingleTurnEnvironment
from rllm.rewards import math_reward_fn

env = SingleTurnEnvironment(
    task={"question": "What is 2+2?", "answer": "4"},
    reward_fn=math_reward_fn
)

Constructor

def __init__(
    task: dict | None = None,
    reward_fn: RewardFunction | None = None,
    **kwargs
)
task
dict | None
Dictionary containing the task information, including at least a "question" field.
reward_fn
RewardFunction | None
Custom reward function to evaluate agent responses. If None, uses zero reward with a warning.

Methods

get_reward_and_next_obs

Compute the reward based on the task and action.
reward, next_obs = env.get_reward_and_next_obs(task, action)

from_dict

Create environment from dictionary.
env = SingleTurnEnvironment.from_dict({
    "task": {"question": "...", "answer": "..."},
    "reward_fn": my_reward_fn
})

Example: Custom Multi-Turn Environment

from rllm.environments import MultiTurnEnvironment
from rllm.rewards import RewardOutput

class ConversationEnv(MultiTurnEnvironment):
    def __init__(self, task=None, max_turns=5):
        super().__init__(task=task, max_turns=max_turns)
    
    def get_reward_and_next_obs(self, task, action):
        # Custom reward logic
        if "correct keyword" in action.lower():
            reward = 1.0
        else:
            reward = 0.0
        
        # Prepare next observation
        next_obs = {
            "turn": self.current_turn,
            "previous_action": action
        }
        
        return reward, next_obs
    
    @staticmethod
    def from_dict(env_args: dict):
        return ConversationEnv(
            task=env_args.get("task"),
            max_turns=env_args.get("max_turns", 5)
        )

# Usage
env = ConversationEnv(
    task={"goal": "Find the keyword"},
    max_turns=3
)

obs, info = env.reset()
for turn in range(3):
    action = "My response with correct keyword"
    next_obs, reward, done, info = env.step(action)
    print(f"Turn {turn}: reward={reward}, done={done}")

Example: Using SingleTurnEnvironment

from rllm.environments import SingleTurnEnvironment
from rllm.rewards import math_reward_fn

# Create environment with math reward function
env = SingleTurnEnvironment(
    task={
        "question": "Solve: 2x + 5 = 13",
        "answer": "x = 4"
    },
    reward_fn=math_reward_fn
)

# Reset and step
obs, info = env.reset()
action = "x = 4"  # Agent's response
next_obs, reward, done, info = env.step(action)

print(f"Reward: {reward}")  # 1.0 if correct
print(f"Done: {done}")      # True (single turn)

ToolEnvironment

Environment for agents that use tools, handling tool execution and response evaluation.
from rllm.environments import ToolEnvironment
Source: rllm/environments/tools/tool_env.py

Constructor

def __init__(
    task: dict | None = None,
    tools: list[str] | None = None,
    tool_map: dict[str, type[Tool]] | None = None,
    reward_fn: RewardFunction | None = None,
    max_steps: int = 10
)
task
dict | None
Task information dictionary. Typically includes "question" and "ground_truth" or "answer" fields.
tools
list[str] | None
List of tool names to load from the registry (e.g., ["python", "google_search"]). Mutually exclusive with tool_map.
tool_map
dict[str, type[Tool]] | None
Dictionary mapping tool names to Tool classes for custom tools. Mutually exclusive with tools.
reward_fn
RewardFunction | None
Reward function for evaluating agent responses. If None, returns 0 reward with a warning.
max_steps
int
default:"10"
Maximum number of steps before terminating the episode.

Properties

tools
MultiTool
MultiTool instance managing available tools.
step_count
int
Current step number in the episode.

Methods

reset

Reset the environment to the initial state.
obs, info = env.reset()
obs
dict
The task dictionary.
info
dict
Empty dictionary (for compatibility).

step

Execute tools based on agent action and return results.
next_obs, reward, done, info = env.step(action)
action
list[dict] | str | dict
Agent action. Can be:
  • list[dict]: Tool calls to execute
  • str: Final answer (triggers termination and reward computation)
  • dict: Single tool call
Termination conditions:
  • Agent provides a string response (final answer)
  • Agent calls the "finish" tool
  • max_steps reached
next_obs
dict
Dictionary with "tool_outputs" mapping call IDs to results, or None if done.
reward
float
Reward from reward function (only computed on final step).
done
bool
Whether the episode has terminated.
info
dict
Additional information from the environment.

Example

from rllm.environments import ToolEnvironment
from rllm.rewards import math_reward_fn

# Create environment with Python tool
env = ToolEnvironment(
    task={
        "question": "What is 10 factorial?",
        "answer": "3628800"
    },
    tools=["python"],
    reward_fn=math_reward_fn,
    max_steps=5
)

# Reset
obs, info = env.reset()
print(obs)  # {"question": "What is 10 factorial?", "answer": "3628800"}

# Agent calls Python tool
tool_calls = [{
    "id": "call_123",
    "function": {
        "name": "python",
        "arguments": {"code": "import math; math.factorial(10)"}
    }
}]

next_obs, reward, done, info = env.step(tool_calls)
print(next_obs)  # {"tool_outputs": {"call_123": "3628800"}}
print(done)      # False

# Agent provides final answer
final_answer = "3628800"
next_obs, reward, done, info = env.step(final_answer)
print(reward)    # 1.0 (correct answer)
print(done)      # True

Tool Call Format

Tool calls should be dictionaries with this structure:
{
    "id": "unique_call_id",
    "function": {
        "name": "tool_name",
        "arguments": {"param": "value"}
    }
}
Special tool: "finish" — signals completion and provides the final answer:
{
    "id": "call_finish",
    "function": {
        "name": "finish",
        "arguments": {"response": "Final answer here"}
    }
}