rLLM Rewards Module Reference
The rewards module provides customizable reward functions for various task domains including math, code, and search.

RewardFunction

Protocol defining the interface for reward functions.
from rllm.rewards import RewardFunction

class RewardFunction(Protocol):
    """Structural (duck-typed) interface that all reward functions satisfy.

    Any callable accepting a task dict and an action and returning a
    RewardOutput conforms to this protocol.
    """

    def __call__(self, task_info: dict, action: str) -> RewardOutput:
        """Evaluate *action* against *task_info* and return a RewardOutput."""
        ...
task_info
dict
Task dictionary containing question, answer, and other metadata.
action
str | Action
Agent’s response/solution. Can be a string or Action object.
output
RewardOutput
Reward output containing reward value and metadata.

RewardOutput

Dataclass for reward function results.
from rllm.rewards import RewardOutput

output = RewardOutput(reward=1.0, metadata={"correct": True})
reward
float
The computed reward value (typically 0.0 or 1.0).
metadata
dict
Additional information about the evaluation.

Built-in Reward Functions

math_reward_fn

Reward function for mathematical reasoning tasks.
from rllm.rewards import math_reward_fn

task = {
    "question": "What is 2 + 2?",
    "answer": "4",
    "data_source": "hendrycks_math"  # optional
}
action = "The answer is 4"

output = math_reward_fn(task, action)
print(output.reward)  # 1.0 if correct, 0.0 otherwise
Supported task fields:
  • answer or ground_truth: Expected answer
  • data_source: Dataset identifier (optional)

code_reward_fn

Reward function for code generation tasks with execution-based evaluation.
from rllm.rewards import code_reward_fn

task = {
    "test_cases": [
        {"input": [2, 3], "output": 5},
        {"input": [10, 20], "output": 30}
    ],
    "entry_point": "add_numbers"
}
action = '''
def add_numbers(a, b):
    return a + b
'''

output = code_reward_fn(task, action)
print(output.reward)  # 1.0 if all tests pass
Supported task fields:
  • test_cases: List of input/output test pairs
  • entry_point: Function name to test
  • timeout: Execution timeout in seconds (optional)

search_reward_fn

Reward function for information retrieval and question answering tasks.
from rllm.rewards import search_reward_fn

task = {
    "question": "What is the capital of France?",
    "ground_truth": "Paris",
    "supporting_facts": [["France", 0]]  # optional
}
action = "The capital of France is Paris."

output = search_reward_fn(task, action)
print(output.reward)  # F1 score or exact match
Supported task fields:
  • ground_truth or answer: Expected answer
  • supporting_facts: Supporting evidence (optional)
  • data_source: Dataset identifier (optional)

f1_reward_fn

Generic F1 score-based reward function using token overlap.
from rllm.rewards import f1_reward_fn

task = {"ground_truth": "The quick brown fox"}
action = "the quick fox"

output = f1_reward_fn(task, action)
print(output.reward)  # F1 score based on normalized token overlap
Features:
  • Normalizes text (lowercase, remove punctuation/articles)
  • Computes token-level precision and recall
  • Returns F1 score as reward

zero_reward

Placeholder reward function that always returns a reward of 0.0.
from rllm.rewards import zero_reward

output = zero_reward(task_info={}, action="anything")
print(output.reward)  # Always 0.0

Custom Reward Functions

Example: Custom Keyword Reward

from rllm.rewards import RewardOutput

def keyword_reward_fn(task_info: dict, action: str) -> RewardOutput:
    """Score the agent by the fraction of required keywords present in its response.

    Args:
        task_info: Task dict; reads the optional "keywords" list.
        action: The agent's response text (matched case-insensitively).

    Returns:
        RewardOutput whose reward is hits / total keywords (0.0 when no
        keywords are configured), with hit counts in the metadata.
    """
    keywords = task_info.get("keywords", [])
    haystack = action.lower()

    hits = len([kw for kw in keywords if kw.lower() in haystack])
    if keywords:
        score = hits / len(keywords)
    else:
        # No keywords configured: nothing to match, so no credit.
        score = 0.0

    return RewardOutput(
        reward=score,
        metadata={"matches": hits, "total": len(keywords)},
    )

# Usage
task = {"keywords": ["machine learning", "neural network"]}
action = "Machine learning uses neural networks for prediction."

output = keyword_reward_fn(task, action)
print(output.reward)  # 1.0 (both keywords present)

Example: Custom Length Penalty

from rllm.rewards import RewardOutput, math_reward_fn

def concise_math_reward(task_info: dict, action: str) -> RewardOutput:
    """Wrap math_reward_fn with a linear penalty for responses beyond a length cap.

    Args:
        task_info: Task dict; honors an optional "max_length" (default 500).
        action: The agent's response text.

    Returns:
        RewardOutput with the penalized reward (floored at 0) and metadata
        recording the base reward, the penalty applied, and the response length.
    """
    base = math_reward_fn(task_info, action)

    # Penalty grows by 0.001 per character past the cap; never negative.
    cap = task_info.get("max_length", 500)
    penalty = max(0, (len(action) - cap) / 1000)

    adjusted = max(0, base.reward - penalty)

    # Keep the base reward's metadata and layer the penalty details on top.
    meta = dict(base.metadata)
    meta.update(
        base_reward=base.reward,
        length_penalty=penalty,
        response_length=len(action),
    )
    return RewardOutput(reward=adjusted, metadata=meta)

Example: Multi-Criteria Reward

from rllm.rewards import RewardOutput
import re

def multi_criteria_reward(task_info: dict, action: str) -> RewardOutput:
    """Combine correctness, reasoning, and formatting checks into one weighted reward.

    Args:
        task_info: Task dict; requires an "answer" field (matched
            case-insensitively as a substring of the response).
        action: The agent's response text.

    Returns:
        RewardOutput with the weighted score and per-criterion scores
        in the metadata.
    """
    text = action.lower()

    # Correctness: the expected answer appears anywhere in the response.
    correctness = 1.0 if task_info["answer"].lower() in text else 0.0

    # Reasoning quality: any connective word suggesting an explanation.
    reasoning = 0.0
    for word in ("because", "therefore", "thus", "since"):
        if word in text:
            reasoning = 1.0
            break

    # Formatting: paragraph breaks or numbered items indicate structure.
    formatting = 1.0 if re.search(r'\n\n|\d+\.', action) else 0.0

    # Weighted sum: correctness dominates (0.7), then reasoning (0.2),
    # then formatting (0.1).
    weights = {"correctness": 0.7, "reasoning": 0.2, "formatting": 0.1}
    combined = (
        weights["correctness"] * correctness
        + weights["reasoning"] * reasoning
        + weights["formatting"] * formatting
    )

    return RewardOutput(
        reward=combined,
        metadata={
            "correctness": correctness,
            "reasoning": reasoning,
            "formatting": formatting,
        },
    )

Using Rewards with Environments

from rllm.environments import SingleTurnEnvironment
from rllm.rewards import math_reward_fn

env = SingleTurnEnvironment(
    task={"question": "What is 5 * 7?", "answer": "35"},
    reward_fn=math_reward_fn
)

obs, info = env.reset()
action = "35"
next_obs, reward, done, info = env.step(action)

print(f"Reward: {reward}")  # 1.0 if correct

Using Rewards with Workflows

from rllm.workflows import SimpleWorkflow
from rllm.rewards import search_reward_fn

workflow = SimpleWorkflow(
    rollout_engine=engine,
    reward_function=search_reward_fn,
    executor=executor
)

episode = await workflow.run(
    task={"question": "...", "ground_truth": "..."},
    uid="task_0"
)