12 changes: 8 additions & 4 deletions benchmarks/swe_bench/run_infer.py
@@ -259,16 +259,20 @@ def _log_event(ev): # keep it simple
)
git_patch = git_patch_result.stdout

history = list(conversation.state.events)
test_result = {
"git_patch": git_patch,
}

# EvalOutput is your model; keep fields consistent with prior JSONL
out = EvalOutput(
instance_id=instance.id,
test_result={
"git_patch": git_patch,
},
test_result=test_result,
instruction=instruction,
error=None,
history=list(conversation.state.events),
history=history,
metrics=conversation.conversation_stats.get_combined_metrics(),
critic_result=self.critic_evaluate(history, test_result=test_result),
)
return out

50 changes: 5 additions & 45 deletions benchmarks/utils/critics.py
@@ -15,11 +15,11 @@
from openhands.sdk import get_logger
from openhands.sdk.critic import (
AgentFinishedCritic,
APIBasedCritic,
CriticBase,
EmptyPatchCritic,
PassCritic,
)
from openhands.sdk.event import LLMConvertibleEvent


logger = get_logger(__name__)
@@ -29,6 +29,7 @@
"pass": PassCritic,
"finish_with_patch": AgentFinishedCritic,
"empty_patch_critic": EmptyPatchCritic,
"api": APIBasedCritic,
}


@@ -107,46 +108,6 @@ def create_critic(args: Namespace) -> CriticBase:
)


def extract_git_patch(eval_output: EvalOutput) -> str | None:
"""
Extract git patch from EvalOutput.

Args:
eval_output: The evaluation output

Returns:
Git patch string or None if not present
"""
if not eval_output.test_result:
return None
return eval_output.test_result.get("git_patch")


def evaluate_output(critic: CriticBase, eval_output: EvalOutput) -> bool:
"""
Evaluate an EvalOutput using a critic.

This is a convenience function that extracts history and git_patch
from EvalOutput and calls the critic's evaluate method.

Args:
critic: The SDK critic to use
eval_output: The evaluation output to check

Returns:
True if the instance was successfully completed, False otherwise
"""
events = eval_output.history
llm_events: list[LLMConvertibleEvent] = [
e for e in events if isinstance(e, LLMConvertibleEvent)
]

git_patch = extract_git_patch(eval_output)
result = critic.evaluate(llm_events, git_patch)

return result.success


def get_completed_instances(output_file: str) -> Set[EvalInstanceID]:
"""
Get all instance IDs present in output file
@@ -186,13 +147,12 @@ def get_completed_instances(output_file: str) -> Set[EvalInstanceID]:
return completed_instances


def get_failed_instances(output_file: str, critic: CriticBase) -> Set[EvalInstanceID]:
def get_failed_instances(output_file: str) -> Set[EvalInstanceID]:
"""
Get the set of failed instance IDs from an output file.

Args:
output_file: Path to the JSONL output file
critic: SDK critic to use for evaluation

Returns:
Set of instance IDs that failed
@@ -210,8 +170,8 @@ def get_failed_instances(output_file: str) -> Set[EvalInstanceID]:
data = json.loads(line.strip())
output = EvalOutput.model_validate(data)

# Evaluate using the critic
if not evaluate_output(critic, output):
# Check critic result (already set during evaluation)
if not output.critic_result.success:
failed_instances.add(output.instance_id)

except json.JSONDecodeError as e:
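
With critic_result persisted in every JSONL row, downstream scripts can score a run without reconstructing a critic. A hedged sketch (the import path for EvalOutput is assumed from benchmarks/utils/models.py):

import json

from benchmarks.utils.models import EvalOutput


def pass_rate(output_file: str) -> float:
    """Fraction of rows whose stored critic_result marks the instance solved."""
    total, passed = 0, 0
    with open(output_file) as f:
        for line in f:
            if not line.strip():
                continue
            output = EvalOutput.model_validate(json.loads(line))
            total += 1
            if output.critic_result.success:
                passed += 1
    return passed / total if total else 0.0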
33 changes: 25 additions & 8 deletions benchmarks/utils/evaluation.py
@@ -10,7 +10,7 @@
from concurrent.futures import ProcessPoolExecutor, as_completed
from contextlib import contextmanager
from pathlib import Path
from typing import Callable, List, Optional, Tuple
from typing import Any, Callable, List, Optional, Tuple

from pydantic import BaseModel, Field
from tqdm import tqdm
@@ -24,8 +24,9 @@
EvalMetadata,
EvalOutput,
)
from openhands.sdk import get_logger
from openhands.sdk.critic import CriticBase
from openhands.sdk import Event, get_logger
from openhands.sdk.critic import CriticBase, CriticResult
from openhands.sdk.event import LLMConvertibleEvent
from openhands.sdk.workspace import RemoteWorkspace


@@ -86,19 +87,36 @@ def evaluate_instance(
"""Run evaluation for a single instance in the provided workspace."""
raise NotImplementedError

def critic_evaluate(
self, history: list[Event], test_result: dict[str, Any]
) -> CriticResult:
"""Evaluate the instance using the configured critic."""
llm_events = [e for e in history if isinstance(e, LLMConvertibleEvent)]
git_patch = test_result.get("git_patch")
return self.metadata.critic.evaluate(llm_events, git_patch)

def _create_error_output(
self, instance: EvalInstance, error: Exception, retry_count: int
) -> EvalOutput:
"""Create an EvalOutput object for a failed instance."""
error_msg = (
f"Instance failed after {retry_count} retries. Last error: {str(error)}"
)[:1000]

# Create critic result with score=0 and error message
critic_result = CriticResult(
score=0.0,
message=error_msg,
)

return EvalOutput(
instance_id=instance.id,
test_result={},
instruction=None,
error=(
f"Instance failed after {retry_count} retries. Last error: {str(error)}"
)[:200],
error=error_msg,
history=[],
instance=instance.data,
critic_result=critic_result,
)

def _capture_conversation_archive(
@@ -217,7 +235,7 @@ def _get_instances_for_attempt(
if not os.path.exists(prev_file):
return []

failed_in_prev = get_failed_instances(prev_file, critic)
failed_in_prev = get_failed_instances(prev_file)
return [
inst
for inst in all_instances
@@ -337,7 +355,6 @@ def attempt_on_result(instance: EvalInstance, out: EvalOutput) -> None:
aggregate_results(
output_dir=self.metadata.eval_output_dir,
max_attempts=self.metadata.max_attempts,
critic=self.metadata.critic,
final_output_file="output.jsonl",
)
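
For reference, a hedged sketch of exercising the critic interface that critic_evaluate wraps. The evaluate(llm_events, git_patch) call and the success/score/message fields mirror this diff; the no-argument EmptyPatchCritic constructor and the sample inputs are assumptions:

from openhands.sdk.critic import EmptyPatchCritic
from openhands.sdk.event import LLMConvertibleEvent

critic = EmptyPatchCritic()  # assumed default constructor
history = []  # events from a finished conversation would go here
git_patch = "diff --git a/demo.py b/demo.py\n..."  # patch produced by the agent

# Same filtering as critic_evaluate above.
llm_events = [e for e in history if isinstance(e, LLMConvertibleEvent)]
result = critic.evaluate(llm_events, git_patch)
print(result.success, result.score, result.message)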

17 changes: 5 additions & 12 deletions benchmarks/utils/iterative.py
@@ -9,21 +9,19 @@
import os
from typing import Set

from benchmarks.utils.critics import CriticBase, evaluate_output
from benchmarks.utils.models import EvalInstanceID, EvalOutput
from openhands.sdk import get_logger


logger = get_logger(__name__)


def get_failed_instances(output_file: str, critic: CriticBase) -> Set[EvalInstanceID]:
def get_failed_instances(output_file: str) -> Set[EvalInstanceID]:
"""
Get the set of failed instance IDs from an output file.

Args:
output_file: Path to the JSONL output file
critic: SDK critic to use for evaluation

Returns:
Set of instance IDs that failed
@@ -42,8 +40,8 @@ def get_failed_instances(output_file: str, critic: CriticBase) -> Set[EvalInstanceID]:
data = json.loads(line.strip())
output = EvalOutput.model_validate(data)

# Evaluate using the critic
if not evaluate_output(critic, output):
# Check critic result (already set during evaluation)
if not output.critic_result.success:
failed_instances.add(output.instance_id)

except json.JSONDecodeError as e:
@@ -65,7 +63,6 @@ def get_failed_instances(output_file: str, critic: CriticBase) -> Set[EvalInstanceID]:
def aggregate_results(
output_dir: str,
max_attempts: int,
critic: "CriticBase",
final_output_file: str = "output.jsonl",
) -> None:
"""
@@ -77,7 +74,6 @@
Args:
output_dir: Directory containing attempt files
max_attempts: Maximum number of attempts
critic: Critic instance to use for evaluation
final_output_file: Name of the final output file
"""
logger.info(f"Aggregating results from {max_attempts} attempts")
@@ -109,18 +105,15 @@ def aggregate_results(
# 2. This attempt is the first one to succeed
instance_id = output.instance_id

is_successful = evaluate_output(critic, output)
is_successful = output.critic_result.success

if instance_id not in best_results:
# First time seeing this instance
best_results[instance_id] = output
elif is_successful:
# This attempt succeeded, check if we should replace
current_best = best_results[instance_id]
current_is_successful = evaluate_output(
critic, current_best
)
if not current_is_successful:
if not current_best.critic_result.success:
# Replace failed result with successful one
best_results[instance_id] = output
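# Illustrative walkthrough of the selection rule above (not part of the diff):
#   attempt 1 fails,  attempt 2 passes -> attempt 2 is kept (failed best replaced)
#   attempt 1 passes, attempt 2 passes -> attempt 1 is kept (first result wins)
#   attempt 1 fails,  attempt 2 fails  -> attempt 1 is kept (first result wins)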

5 changes: 4 additions & 1 deletion benchmarks/utils/models.py
@@ -3,7 +3,7 @@
from pydantic import BaseModel, Field

from openhands.sdk import LLM, Event, get_logger
from openhands.sdk.critic import CriticBase
from openhands.sdk.critic import CriticBase, CriticResult
from openhands.sdk.llm import Metrics


Expand Down Expand Up @@ -85,3 +85,6 @@ class EvalOutput(BaseModel):

# Optionally save the input test instance
instance: dict[str, Any] | None = None

# Critic's evaluation result (always set during evaluation)
critic_result: CriticResult
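
Because it is declared without a default, critic_result is now required: JSONL rows written before this change will fail EvalOutput.model_validate. A hedged backfill sketch for legacy files (assumes CriticResult is a pydantic model, as its use inside EvalOutput suggests; the score/message arguments mirror _create_error_output above):

import json

from benchmarks.utils.models import EvalOutput
from openhands.sdk.critic import CriticResult


def load_row(line: str) -> EvalOutput:
    data = json.loads(line)
    # Backfill a neutral result for rows that predate the required field.
    data.setdefault(
        "critic_result",
        CriticResult(score=0.0, message="backfilled legacy row").model_dump(),
    )
    return EvalOutput.model_validate(data)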
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -12,7 +12,7 @@ dependencies = [
"Pillow",
"toml",
"tqdm",
"openhands-sdk",
"openhands-sdk[critic]",
"openhands-tools",
"unidiff>=0.7.5,<0.8.0",
"openhands-agent-server",
29 changes: 11 additions & 18 deletions pyrightconfig.json
@@ -1,19 +1,12 @@
{
"include": [
"benchmarks"
],
"extraPaths": [
"vendor/software-agent-sdk/openhands-sdk",
"vendor/software-agent-sdk/openhands-tools",
"vendor/software-agent-sdk/openhands-workspace",
"vendor/software-agent-sdk/openhands-agent-server"
],
"exclude": [
".venv/**",
"**/__pycache__",
"**/.pytest_cache"
],
"pythonPath": ".venv/bin/python",
"venvPath": ".",
"venv": ".venv"
}
"include": ["benchmarks"],
"extraPaths": [
"vendor/software-agent-sdk/openhands-sdk",
"vendor/software-agent-sdk/openhands-tools",
"vendor/software-agent-sdk/openhands-workspace",
"vendor/software-agent-sdk/openhands-agent-server"
],
"exclude": [".venv/**", "**/__pycache__", "**/.pytest_cache"],
"venvPath": ".",
"venv": ".venv"
}