From 0bc7fd57cc1eedd0d88edd46d943b2d732e075d3 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Wed, 19 Nov 2025 17:55:31 +0000 Subject: [PATCH 01/14] add border & update to latest sdk --- benchmarks/utils/evaluation.py | 1 + uv.lock | 22 ++++++++++++++++++---- vendor/software-agent-sdk | 2 +- 3 files changed, 20 insertions(+), 5 deletions(-) diff --git a/benchmarks/utils/evaluation.py b/benchmarks/utils/evaluation.py index 2830f9aa..54176a8f 100644 --- a/benchmarks/utils/evaluation.py +++ b/benchmarks/utils/evaluation.py @@ -440,6 +440,7 @@ def reset_logger_for_multiprocessing(log_dir: str, instance_id: str) -> None: View live output: • tail -f {log_file} (logger) • tail -f {output_log_file} (stdout/stderr) + =============================================== """.strip() ) diff --git a/uv.lock b/uv.lock index 1a014d50..ce8beb09 100644 --- a/uv.lock +++ b/uv.lock @@ -669,6 +669,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/eb/62/eb8157afb21bd229c864521c1ab4fa8e9b4f1b06bafdd8c4668a7a31b5dd/datasets-4.0.0-py3-none-any.whl", hash = "sha256:7ef95e62025fd122882dbce6cb904c8cd3fbc829de6669a5eb939c77d50e203d", size = 494825, upload-time = "2025-07-09T14:35:50.658Z" }, ] +[[package]] +name = "deprecation" +version = "2.1.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "packaging" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/5a/d3/8ae2869247df154b64c1884d7346d412fed0c49df84db635aab2d1c40e62/deprecation-2.1.0.tar.gz", hash = "sha256:72b3bde64e5d778694b0cf68178aed03d15e15477116add3fb773e581f9518ff", size = 173788, upload-time = "2020-04-20T14:23:38.738Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/02/c3/253a89ee03fc9b9682f1541728eb66db7db22148cd94f89ab22528cd1e1b/deprecation-2.1.0-py2.py3-none-any.whl", hash = "sha256:a10811591210e1fb0e768a8c25517cabeabcba6f0bf96564f8ff45189f90b14a", size = 11178, upload-time = "2020-04-20T14:23:36.581Z" }, +] + [[package]] name = "dill" version = "0.3.8" @@ -1987,7 +1999,7 @@ wheels = [ [[package]] name = "openhands-agent-server" -version = "1.1.0" +version = "1.2.0" source = { editable = "vendor/software-agent-sdk/openhands-agent-server" } dependencies = [ { name = "aiosqlite" }, @@ -2083,9 +2095,10 @@ dev = [ [[package]] name = "openhands-sdk" -version = "1.1.0" +version = "1.2.0" source = { editable = "vendor/software-agent-sdk/openhands-sdk" } dependencies = [ + { name = "deprecation" }, { name = "fastmcp" }, { name = "httpx" }, { name = "litellm" }, @@ -2105,6 +2118,7 @@ boto3 = [ [package.metadata] requires-dist = [ { name = "boto3", marker = "extra == 'boto3'", specifier = ">=1.35.0" }, + { name = "deprecation", specifier = ">=2.1.0" }, { name = "fastmcp", specifier = ">=2.11.3" }, { name = "httpx", specifier = ">=0.27.0" }, { name = "litellm", specifier = ">=1.77.7.dev9" }, @@ -2119,7 +2133,7 @@ provides-extras = ["boto3"] [[package]] name = "openhands-tools" -version = "1.1.0" +version = "1.2.0" source = { editable = "vendor/software-agent-sdk/openhands-tools" } dependencies = [ { name = "bashlex" }, @@ -2146,7 +2160,7 @@ requires-dist = [ [[package]] name = "openhands-workspace" -version = "1.1.0" +version = "1.2.0" source = { editable = "vendor/software-agent-sdk/openhands-workspace" } dependencies = [ { name = "openhands-sdk" }, diff --git a/vendor/software-agent-sdk b/vendor/software-agent-sdk index fb7197ac..e485bba9 160000 --- a/vendor/software-agent-sdk +++ b/vendor/software-agent-sdk @@ -1 +1 @@ -Subproject commit fb7197ac1ec847da8624af67dc63d218107aae8a +Subproject commit e485bba962171d5fefdfa757f1d7dd245da598cd From 06f674a4353b6c07373914537233142ca08c8e0b Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Thu, 20 Nov 2025 17:55:32 +0000 Subject: [PATCH 02/14] bump sdk commit --- vendor/software-agent-sdk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/software-agent-sdk b/vendor/software-agent-sdk index e485bba9..5cc0710e 160000 --- a/vendor/software-agent-sdk +++ b/vendor/software-agent-sdk @@ -1 +1 @@ -Subproject commit e485bba962171d5fefdfa757f1d7dd245da598cd +Subproject commit 5cc0710e9c298d8fd33da1d230c5502d33cedd61 From 3423b33314fe8e810e40520baa0a8db002d5f807 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Thu, 20 Nov 2025 20:34:22 +0000 Subject: [PATCH 03/14] install sdk w/ critic group --- pyproject.toml | 2 +- uv.lock | 55 ++++++++++++++++++++++++++++++++++++--- vendor/software-agent-sdk | 2 +- 3 files changed, 54 insertions(+), 5 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index a7cf1cbe..3965606c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,7 +12,7 @@ dependencies = [ "Pillow", "toml", "tqdm", - "openhands-sdk", + "openhands-sdk[critic]", "openhands-tools", "unidiff>=0.7.5,<0.8.0", "openhands-agent-server", diff --git a/uv.lock b/uv.lock index ce8beb09..076966b9 100644 --- a/uv.lock +++ b/uv.lock @@ -2037,7 +2037,7 @@ dependencies = [ { name = "jinja2" }, { name = "modal" }, { name = "openhands-agent-server" }, - { name = "openhands-sdk" }, + { name = "openhands-sdk", extra = ["critic"] }, { name = "openhands-tools" }, { name = "openhands-workspace" }, { name = "pandas" }, @@ -2069,7 +2069,7 @@ requires-dist = [ { name = "jinja2" }, { name = "modal", specifier = ">=1.1.4" }, { name = "openhands-agent-server", editable = "vendor/software-agent-sdk/openhands-agent-server" }, - { name = "openhands-sdk", editable = "vendor/software-agent-sdk/openhands-sdk" }, + { name = "openhands-sdk", extras = ["critic"], editable = "vendor/software-agent-sdk/openhands-sdk" }, { name = "openhands-tools", editable = "vendor/software-agent-sdk/openhands-tools" }, { name = "openhands-workspace", editable = "vendor/software-agent-sdk/openhands-workspace" }, { name = "pandas" }, @@ -2106,6 +2106,7 @@ dependencies = [ { name = "pydantic" }, { name = "python-frontmatter" }, { name = "python-json-logger" }, + { name = "requests" }, { name = "tenacity" }, { name = "websockets" }, ] @@ -2114,6 +2115,9 @@ dependencies = [ boto3 = [ { name = "boto3" }, ] +critic = [ + { name = "transformers" }, +] [package.metadata] requires-dist = [ @@ -2126,10 +2130,12 @@ requires-dist = [ { name = "pydantic", specifier = ">=2.11.7" }, { name = "python-frontmatter", specifier = ">=1.1.0" }, { name = "python-json-logger", specifier = ">=3.3.0" }, + { name = "requests", specifier = ">=2.31.0" }, { name = "tenacity", specifier = ">=9.1.2" }, + { name = "transformers", marker = "extra == 'critic'", specifier = ">=4.30.0" }, { name = "websockets", specifier = ">=12" }, ] -provides-extras = ["boto3"] +provides-extras = ["boto3", "critic"] [[package]] name = "openhands-tools" @@ -5964,6 +5970,28 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/48/f0/ae7ca09223a81a1d890b2557186ea015f6e0502e9b8cb8e1813f1d8cfa4e/s3transfer-0.14.0-py3-none-any.whl", hash = "sha256:ea3b790c7077558ed1f02a3072fb3cb992bbbd253392f4b6e9e8976941c7d456", size = 85712, upload-time = "2025-09-09T19:23:30.041Z" }, ] +[[package]] +name = "safetensors" +version = "0.7.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/29/9c/6e74567782559a63bd040a236edca26fd71bc7ba88de2ef35d75df3bca5e/safetensors-0.7.0.tar.gz", hash = "sha256:07663963b67e8bd9f0b8ad15bb9163606cd27cc5a1b96235a50d8369803b96b0", size = 200878, upload-time = "2025-11-19T15:18:43.199Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fa/47/aef6c06649039accf914afef490268e1067ed82be62bcfa5b7e886ad15e8/safetensors-0.7.0-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:c82f4d474cf725255d9e6acf17252991c3c8aac038d6ef363a4bf8be2f6db517", size = 467781, upload-time = "2025-11-19T15:18:35.84Z" }, + { url = "https://files.pythonhosted.org/packages/e8/00/374c0c068e30cd31f1e1b46b4b5738168ec79e7689ca82ee93ddfea05109/safetensors-0.7.0-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:94fd4858284736bb67a897a41608b5b0c2496c9bdb3bf2af1fa3409127f20d57", size = 447058, upload-time = "2025-11-19T15:18:34.416Z" }, + { url = "https://files.pythonhosted.org/packages/f1/06/578ffed52c2296f93d7fd2d844cabfa92be51a587c38c8afbb8ae449ca89/safetensors-0.7.0-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e07d91d0c92a31200f25351f4acb2bc6aff7f48094e13ebb1d0fb995b54b6542", size = 491748, upload-time = "2025-11-19T15:18:09.79Z" }, + { url = "https://files.pythonhosted.org/packages/ae/33/1debbbb70e4791dde185edb9413d1fe01619255abb64b300157d7f15dddd/safetensors-0.7.0-cp38-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:8469155f4cb518bafb4acf4865e8bb9d6804110d2d9bdcaa78564b9fd841e104", size = 503881, upload-time = "2025-11-19T15:18:16.145Z" }, + { url = "https://files.pythonhosted.org/packages/8e/1c/40c2ca924d60792c3be509833df711b553c60effbd91da6f5284a83f7122/safetensors-0.7.0-cp38-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:54bef08bf00a2bff599982f6b08e8770e09cc012d7bba00783fc7ea38f1fb37d", size = 623463, upload-time = "2025-11-19T15:18:21.11Z" }, + { url = "https://files.pythonhosted.org/packages/9b/3a/13784a9364bd43b0d61eef4bea2845039bc2030458b16594a1bd787ae26e/safetensors-0.7.0-cp38-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:42cb091236206bb2016d245c377ed383aa7f78691748f3bb6ee1bfa51ae2ce6a", size = 532855, upload-time = "2025-11-19T15:18:25.719Z" }, + { url = "https://files.pythonhosted.org/packages/a0/60/429e9b1cb3fc651937727befe258ea24122d9663e4d5709a48c9cbfceecb/safetensors-0.7.0-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dac7252938f0696ddea46f5e855dd3138444e82236e3be475f54929f0c510d48", size = 507152, upload-time = "2025-11-19T15:18:33.023Z" }, + { url = "https://files.pythonhosted.org/packages/3c/a8/4b45e4e059270d17af60359713ffd83f97900d45a6afa73aaa0d737d48b6/safetensors-0.7.0-cp38-abi3-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1d060c70284127fa805085d8f10fbd0962792aed71879d00864acda69dbab981", size = 541856, upload-time = "2025-11-19T15:18:31.075Z" }, + { url = "https://files.pythonhosted.org/packages/06/87/d26d8407c44175d8ae164a95b5a62707fcc445f3c0c56108e37d98070a3d/safetensors-0.7.0-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:cdab83a366799fa730f90a4ebb563e494f28e9e92c4819e556152ad55e43591b", size = 674060, upload-time = "2025-11-19T15:18:37.211Z" }, + { url = "https://files.pythonhosted.org/packages/11/f5/57644a2ff08dc6325816ba7217e5095f17269dada2554b658442c66aed51/safetensors-0.7.0-cp38-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:672132907fcad9f2aedcb705b2d7b3b93354a2aec1b2f706c4db852abe338f85", size = 771715, upload-time = "2025-11-19T15:18:38.689Z" }, + { url = "https://files.pythonhosted.org/packages/86/31/17883e13a814bd278ae6e266b13282a01049b0c81341da7fd0e3e71a80a3/safetensors-0.7.0-cp38-abi3-musllinux_1_2_i686.whl", hash = "sha256:5d72abdb8a4d56d4020713724ba81dac065fedb7f3667151c4a637f1d3fb26c0", size = 714377, upload-time = "2025-11-19T15:18:40.162Z" }, + { url = "https://files.pythonhosted.org/packages/4a/d8/0c8a7dc9b41dcac53c4cbf9df2b9c83e0e0097203de8b37a712b345c0be5/safetensors-0.7.0-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:b0f6d66c1c538d5a94a73aa9ddca8ccc4227e6c9ff555322ea40bdd142391dd4", size = 677368, upload-time = "2025-11-19T15:18:41.627Z" }, + { url = "https://files.pythonhosted.org/packages/05/e5/cb4b713c8a93469e3c5be7c3f8d77d307e65fe89673e731f5c2bfd0a9237/safetensors-0.7.0-cp38-abi3-win32.whl", hash = "sha256:c74af94bf3ac15ac4d0f2a7c7b4663a15f8c2ab15ed0fc7531ca61d0835eccba", size = 326423, upload-time = "2025-11-19T15:18:45.74Z" }, + { url = "https://files.pythonhosted.org/packages/5d/e6/ec8471c8072382cb91233ba7267fd931219753bb43814cbc71757bfd4dab/safetensors-0.7.0-cp38-abi3-win_amd64.whl", hash = "sha256:d1239932053f56f3456f32eb9625590cc7582e905021f94636202a864d470755", size = 341380, upload-time = "2025-11-19T15:18:44.427Z" }, +] + [[package]] name = "screeninfo" version = "0.8.1" @@ -6228,6 +6256,27 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d0/30/dc54f88dd4a2b5dc8a0279bdd7270e735851848b762aeb1c1184ed1f6b14/tqdm-4.67.1-py3-none-any.whl", hash = "sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2", size = 78540, upload-time = "2024-11-24T20:12:19.698Z" }, ] +[[package]] +name = "transformers" +version = "4.57.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "filelock" }, + { name = "huggingface-hub" }, + { name = "numpy" }, + { name = "packaging" }, + { name = "pyyaml" }, + { name = "regex" }, + { name = "requests" }, + { name = "safetensors" }, + { name = "tokenizers" }, + { name = "tqdm" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d6/68/a39307bcc4116a30b2106f2e689130a48de8bd8a1e635b5e1030e46fcd9e/transformers-4.57.1.tar.gz", hash = "sha256:f06c837959196c75039809636cd964b959f6604b75b8eeec6fdfc0440b89cc55", size = 10142511, upload-time = "2025-10-14T15:39:26.18Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/71/d3/c16c3b3cf7655a67db1144da94b021c200ac1303f82428f2beef6c2e72bb/transformers-4.57.1-py3-none-any.whl", hash = "sha256:b10d05da8fa67dc41644dbbf9bc45a44cb86ae33da6f9295f5fbf5b7890bd267", size = 11990925, upload-time = "2025-10-14T15:39:23.085Z" }, +] + [[package]] name = "typer" version = "0.17.4" diff --git a/vendor/software-agent-sdk b/vendor/software-agent-sdk index 5cc0710e..64b75d60 160000 --- a/vendor/software-agent-sdk +++ b/vendor/software-agent-sdk @@ -1 +1 @@ -Subproject commit 5cc0710e9c298d8fd33da1d230c5502d33cedd61 +Subproject commit 64b75d60505e01c4d7661314aae2ef594bc309ab From 3e1486f7562f292d96d4b9620c10961b90abe6f9 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Thu, 20 Nov 2025 20:54:35 +0000 Subject: [PATCH 04/14] add APIBasedCritic --- benchmarks/utils/critics.py | 2 ++ vendor/software-agent-sdk | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/benchmarks/utils/critics.py b/benchmarks/utils/critics.py index af9c55ae..fb1e781a 100644 --- a/benchmarks/utils/critics.py +++ b/benchmarks/utils/critics.py @@ -15,6 +15,7 @@ from openhands.sdk import get_logger from openhands.sdk.critic import ( AgentFinishedCritic, + APIBasedCritic, CriticBase, EmptyPatchCritic, PassCritic, @@ -29,6 +30,7 @@ "pass": PassCritic, "finish_with_patch": AgentFinishedCritic, "empty_patch_critic": EmptyPatchCritic, + "api": APIBasedCritic, } diff --git a/vendor/software-agent-sdk b/vendor/software-agent-sdk index 64b75d60..1b640a68 160000 --- a/vendor/software-agent-sdk +++ b/vendor/software-agent-sdk @@ -1 +1 @@ -Subproject commit 64b75d60505e01c4d7661314aae2ef594bc309ab +Subproject commit 1b640a68be645474349d3294c5f2c530b2725034 From 067ad65fd01cc5c082c7844ab37d550db08c036c Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Tue, 25 Nov 2025 23:23:34 +0000 Subject: [PATCH 05/14] remove pyright config --- pyrightconfig.json | 19 ------------------- 1 file changed, 19 deletions(-) delete mode 100644 pyrightconfig.json diff --git a/pyrightconfig.json b/pyrightconfig.json deleted file mode 100644 index 1c3447c2..00000000 --- a/pyrightconfig.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "include": [ - "benchmarks" - ], - "extraPaths": [ - "vendor/software-agent-sdk/openhands-sdk", - "vendor/software-agent-sdk/openhands-tools", - "vendor/software-agent-sdk/openhands-workspace", - "vendor/software-agent-sdk/openhands-agent-server" - ], - "exclude": [ - ".venv/**", - "**/__pycache__", - "**/.pytest_cache" - ], - "pythonPath": ".venv/bin/python", - "venvPath": ".", - "venv": ".venv" -} \ No newline at end of file From cb4f3b89aed80292c2ba43749da896f700fb0395 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Tue, 25 Nov 2025 23:29:27 +0000 Subject: [PATCH 06/14] Revert "remove pyright config" This reverts commit 067ad65fd01cc5c082c7844ab37d550db08c036c. --- pyrightconfig.json | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 pyrightconfig.json diff --git a/pyrightconfig.json b/pyrightconfig.json new file mode 100644 index 00000000..1c3447c2 --- /dev/null +++ b/pyrightconfig.json @@ -0,0 +1,19 @@ +{ + "include": [ + "benchmarks" + ], + "extraPaths": [ + "vendor/software-agent-sdk/openhands-sdk", + "vendor/software-agent-sdk/openhands-tools", + "vendor/software-agent-sdk/openhands-workspace", + "vendor/software-agent-sdk/openhands-agent-server" + ], + "exclude": [ + ".venv/**", + "**/__pycache__", + "**/.pytest_cache" + ], + "pythonPath": ".venv/bin/python", + "venvPath": ".", + "venv": ".venv" +} \ No newline at end of file From 568b14392fe8ca6ac6f1cc5676b13142e4948d00 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Tue, 25 Nov 2025 23:30:57 +0000 Subject: [PATCH 07/14] Update pyrightconfig.json --- pyrightconfig.json | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/pyrightconfig.json b/pyrightconfig.json index 1c3447c2..0a61fee7 100644 --- a/pyrightconfig.json +++ b/pyrightconfig.json @@ -1,19 +1,12 @@ { - "include": [ - "benchmarks" - ], - "extraPaths": [ - "vendor/software-agent-sdk/openhands-sdk", - "vendor/software-agent-sdk/openhands-tools", - "vendor/software-agent-sdk/openhands-workspace", - "vendor/software-agent-sdk/openhands-agent-server" - ], - "exclude": [ - ".venv/**", - "**/__pycache__", - "**/.pytest_cache" - ], - "pythonPath": ".venv/bin/python", - "venvPath": ".", - "venv": ".venv" -} \ No newline at end of file + "include": ["benchmarks"], + "extraPaths": [ + "vendor/software-agent-sdk/openhands-sdk", + "vendor/software-agent-sdk/openhands-tools", + "vendor/software-agent-sdk/openhands-workspace", + "vendor/software-agent-sdk/openhands-agent-server" + ], + "exclude": [".venv/**", "**/__pycache__", "**/.pytest_cache"], + "venvPath": ".", + "venv": ".venv" +} From bad7d68e4440d826249f35d4967e82f161a0b9fd Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Tue, 25 Nov 2025 23:49:36 +0000 Subject: [PATCH 08/14] always require critic result --- benchmarks/utils/critics.py | 33 +++------------------------------ benchmarks/utils/evaluation.py | 33 ++++++++++++++++++++++++++------- benchmarks/utils/iterative.py | 17 +++++------------ benchmarks/utils/models.py | 5 ++++- 4 files changed, 38 insertions(+), 50 deletions(-) diff --git a/benchmarks/utils/critics.py b/benchmarks/utils/critics.py index fb1e781a..192821d4 100644 --- a/benchmarks/utils/critics.py +++ b/benchmarks/utils/critics.py @@ -20,7 +20,6 @@ EmptyPatchCritic, PassCritic, ) -from openhands.sdk.event import LLMConvertibleEvent logger = get_logger(__name__) @@ -124,31 +123,6 @@ def extract_git_patch(eval_output: EvalOutput) -> str | None: return eval_output.test_result.get("git_patch") -def evaluate_output(critic: CriticBase, eval_output: EvalOutput) -> bool: - """ - Evaluate an EvalOutput using a critic. - - This is a convenience function that extracts history and git_patch - from EvalOutput and calls the critic's evaluate method. - - Args: - critic: The SDK critic to use - eval_output: The evaluation output to check - - Returns: - True if the instance was successfully completed, False otherwise - """ - events = eval_output.history - llm_events: list[LLMConvertibleEvent] = [ - e for e in events if isinstance(e, LLMConvertibleEvent) - ] - - git_patch = extract_git_patch(eval_output) - result = critic.evaluate(llm_events, git_patch) - - return result.success - - def get_completed_instances(output_file: str) -> Set[EvalInstanceID]: """ Get all instance IDs present in output file @@ -188,13 +162,12 @@ def get_completed_instances(output_file: str) -> Set[EvalInstanceID]: return completed_instances -def get_failed_instances(output_file: str, critic: CriticBase) -> Set[EvalInstanceID]: +def get_failed_instances(output_file: str) -> Set[EvalInstanceID]: """ Get the set of failed instance IDs from an output file. Args: output_file: Path to the JSONL output file - critic: SDK critic to use for evaluation Returns: Set of instance IDs that failed @@ -212,8 +185,8 @@ def get_failed_instances(output_file: str, critic: CriticBase) -> Set[EvalInstan data = json.loads(line.strip()) output = EvalOutput.model_validate(data) - # Evaluate using the critic - if not evaluate_output(critic, output): + # Check critic result (already set during evaluation) + if not output.critic_result.success: failed_instances.add(output.instance_id) except json.JSONDecodeError as e: diff --git a/benchmarks/utils/evaluation.py b/benchmarks/utils/evaluation.py index 54176a8f..382d49f2 100644 --- a/benchmarks/utils/evaluation.py +++ b/benchmarks/utils/evaluation.py @@ -14,7 +14,7 @@ from tqdm import tqdm from benchmarks.utils.constants import OUTPUT_FILENAME -from benchmarks.utils.critics import get_completed_instances +from benchmarks.utils.critics import extract_git_patch, get_completed_instances from benchmarks.utils.iterative import aggregate_results, get_failed_instances from benchmarks.utils.models import ( EvalInstance, @@ -23,7 +23,8 @@ EvalOutput, ) from openhands.sdk import get_logger -from openhands.sdk.critic import CriticBase +from openhands.sdk.critic import CriticBase, CriticResult +from openhands.sdk.event import LLMConvertibleEvent from openhands.sdk.workspace import RemoteWorkspace @@ -88,15 +89,24 @@ def _create_error_output( self, instance: EvalInstance, error: Exception, retry_count: int ) -> EvalOutput: """Create an EvalOutput object for a failed instance.""" + error_msg = ( + f"Instance failed after {retry_count} retries. Last error: {str(error)}" + )[:200] + + # Create critic result with score=0 and error message + critic_result = CriticResult( + score=0.0, + message=error_msg, + ) + return EvalOutput( instance_id=instance.id, test_result={}, instruction=None, - error=( - f"Instance failed after {retry_count} retries. Last error: {str(error)}" - )[:200], + error=error_msg, history=[], instance=instance.data, + critic_result=critic_result, ) # --- Runner --- @@ -161,7 +171,7 @@ def _get_instances_for_attempt( if not os.path.exists(prev_file): return [] - failed_in_prev = get_failed_instances(prev_file, critic) + failed_in_prev = get_failed_instances(prev_file) return [ inst for inst in all_instances @@ -281,7 +291,6 @@ def attempt_on_result(instance: EvalInstance, out: EvalOutput) -> None: aggregate_results( output_dir=self.metadata.eval_output_dir, max_attempts=self.metadata.max_attempts, - critic=self.metadata.critic, final_output_file="output.jsonl", ) @@ -350,6 +359,16 @@ def _process_one_mp( try: workspace = self.prepare_workspace(instance) out = self.evaluate_instance(instance, workspace) + + # Evaluate with critic and save result + llm_events = [ + e for e in out.history if isinstance(e, LLMConvertibleEvent) + ] + git_patch = extract_git_patch(out) + out.critic_result = self.metadata.critic.evaluate( + llm_events, git_patch + ) + logger.info("[child] done id=%s", instance.id) return instance, out except Exception as e: diff --git a/benchmarks/utils/iterative.py b/benchmarks/utils/iterative.py index 71ebf2c7..fa8949f3 100644 --- a/benchmarks/utils/iterative.py +++ b/benchmarks/utils/iterative.py @@ -9,7 +9,6 @@ import os from typing import Set -from benchmarks.utils.critics import CriticBase, evaluate_output from benchmarks.utils.models import EvalInstanceID, EvalOutput from openhands.sdk import get_logger @@ -17,13 +16,12 @@ logger = get_logger(__name__) -def get_failed_instances(output_file: str, critic: CriticBase) -> Set[EvalInstanceID]: +def get_failed_instances(output_file: str) -> Set[EvalInstanceID]: """ Get the set of failed instance IDs from an output file. Args: output_file: Path to the JSONL output file - critic: SDK critic to use for evaluation Returns: Set of instance IDs that failed @@ -42,8 +40,8 @@ def get_failed_instances(output_file: str, critic: CriticBase) -> Set[EvalInstan data = json.loads(line.strip()) output = EvalOutput.model_validate(data) - # Evaluate using the critic - if not evaluate_output(critic, output): + # Check critic result (already set during evaluation) + if not output.critic_result.success: failed_instances.add(output.instance_id) except json.JSONDecodeError as e: @@ -65,7 +63,6 @@ def get_failed_instances(output_file: str, critic: CriticBase) -> Set[EvalInstan def aggregate_results( output_dir: str, max_attempts: int, - critic: "CriticBase", final_output_file: str = "output.jsonl", ) -> None: """ @@ -77,7 +74,6 @@ def aggregate_results( Args: output_dir: Directory containing attempt files max_attempts: Maximum number of attempts - critic: Critic instance to use for evaluation final_output_file: Name of the final output file """ logger.info(f"Aggregating results from {max_attempts} attempts") @@ -109,7 +105,7 @@ def aggregate_results( # 2. This attempt is the first one to succeed instance_id = output.instance_id - is_successful = evaluate_output(critic, output) + is_successful = output.critic_result.success if instance_id not in best_results: # First time seeing this instance @@ -117,10 +113,7 @@ def aggregate_results( elif is_successful: # This attempt succeeded, check if we should replace current_best = best_results[instance_id] - current_is_successful = evaluate_output( - critic, current_best - ) - if not current_is_successful: + if not current_best.critic_result.success: # Replace failed result with successful one best_results[instance_id] = output diff --git a/benchmarks/utils/models.py b/benchmarks/utils/models.py index eb5c954b..a97ee7fd 100644 --- a/benchmarks/utils/models.py +++ b/benchmarks/utils/models.py @@ -3,7 +3,7 @@ from pydantic import BaseModel, Field from openhands.sdk import LLM, Event, get_logger -from openhands.sdk.critic import CriticBase +from openhands.sdk.critic import CriticBase, CriticResult from openhands.sdk.llm import Metrics @@ -85,3 +85,6 @@ class EvalOutput(BaseModel): # Optionally save the input test instance instance: dict[str, Any] | None = None + + # Critic's evaluation result (always set during evaluation) + critic_result: CriticResult From c41280d96076d66f3afd8645724c1c3e04ca4cb1 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Tue, 25 Nov 2025 23:52:10 +0000 Subject: [PATCH 09/14] update critic model --- vendor/software-agent-sdk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/software-agent-sdk b/vendor/software-agent-sdk index 1b640a68..87452f47 160000 --- a/vendor/software-agent-sdk +++ b/vendor/software-agent-sdk @@ -1 +1 @@ -Subproject commit 1b640a68be645474349d3294c5f2c530b2725034 +Subproject commit 87452f47de572a218cd28325ae419ac6fe110dce From b3c041f5a265d5fd1d29f7849bd84f23bb7d91e3 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Wed, 26 Nov 2025 02:11:55 +0000 Subject: [PATCH 10/14] update commit --- uv.lock | 8 ++++---- vendor/software-agent-sdk | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/uv.lock b/uv.lock index 076966b9..eb9f02f4 100644 --- a/uv.lock +++ b/uv.lock @@ -1999,7 +1999,7 @@ wheels = [ [[package]] name = "openhands-agent-server" -version = "1.2.0" +version = "1.3.0" source = { editable = "vendor/software-agent-sdk/openhands-agent-server" } dependencies = [ { name = "aiosqlite" }, @@ -2095,7 +2095,7 @@ dev = [ [[package]] name = "openhands-sdk" -version = "1.2.0" +version = "1.3.0" source = { editable = "vendor/software-agent-sdk/openhands-sdk" } dependencies = [ { name = "deprecation" }, @@ -2139,7 +2139,7 @@ provides-extras = ["boto3", "critic"] [[package]] name = "openhands-tools" -version = "1.2.0" +version = "1.3.0" source = { editable = "vendor/software-agent-sdk/openhands-tools" } dependencies = [ { name = "bashlex" }, @@ -2166,7 +2166,7 @@ requires-dist = [ [[package]] name = "openhands-workspace" -version = "1.2.0" +version = "1.3.0" source = { editable = "vendor/software-agent-sdk/openhands-workspace" } dependencies = [ { name = "openhands-sdk" }, diff --git a/vendor/software-agent-sdk b/vendor/software-agent-sdk index 87452f47..3fb0068f 160000 --- a/vendor/software-agent-sdk +++ b/vendor/software-agent-sdk @@ -1 +1 @@ -Subproject commit 87452f47de572a218cd28325ae419ac6fe110dce +Subproject commit 3fb0068f44bb31c7582b20796f004703e41bd96c From 54c3f2157fa62374534afab023e9a045e23d0dc0 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Wed, 26 Nov 2025 04:20:56 +0000 Subject: [PATCH 11/14] fix run infer for swebench --- benchmarks/swe_bench/run_infer.py | 12 ++++++++---- benchmarks/utils/critics.py | 15 --------------- benchmarks/utils/evaluation.py | 24 +++++++++++------------- 3 files changed, 19 insertions(+), 32 deletions(-) diff --git a/benchmarks/swe_bench/run_infer.py b/benchmarks/swe_bench/run_infer.py index 40a5c561..d95c3ad3 100644 --- a/benchmarks/swe_bench/run_infer.py +++ b/benchmarks/swe_bench/run_infer.py @@ -259,16 +259,20 @@ def _log_event(ev): # keep it simple ) git_patch = git_patch_result.stdout + history = list(conversation.state.events) + test_result = { + "git_patch": git_patch, + } + # EvalOutput is your model; keep fields consistent with prior JSONL out = EvalOutput( instance_id=instance.id, - test_result={ - "git_patch": git_patch, - }, + test_result=test_result, instruction=instruction, error=None, - history=list(conversation.state.events), + history=history, metrics=conversation.conversation_stats.get_combined_metrics(), + critic_result=self.critic_evaluate(history, test_result=test_result), ) return out diff --git a/benchmarks/utils/critics.py b/benchmarks/utils/critics.py index 192821d4..381a8b3c 100644 --- a/benchmarks/utils/critics.py +++ b/benchmarks/utils/critics.py @@ -108,21 +108,6 @@ def create_critic(args: Namespace) -> CriticBase: ) -def extract_git_patch(eval_output: EvalOutput) -> str | None: - """ - Extract git patch from EvalOutput. - - Args: - eval_output: The evaluation output - - Returns: - Git patch string or None if not present - """ - if not eval_output.test_result: - return None - return eval_output.test_result.get("git_patch") - - def get_completed_instances(output_file: str) -> Set[EvalInstanceID]: """ Get all instance IDs present in output file diff --git a/benchmarks/utils/evaluation.py b/benchmarks/utils/evaluation.py index 382d49f2..b72fc02b 100644 --- a/benchmarks/utils/evaluation.py +++ b/benchmarks/utils/evaluation.py @@ -8,13 +8,13 @@ from abc import ABC, abstractmethod from concurrent.futures import ProcessPoolExecutor, as_completed from contextlib import contextmanager -from typing import Callable, List, Optional, Tuple +from typing import Any, Callable, List, Optional, Tuple from pydantic import BaseModel, Field from tqdm import tqdm from benchmarks.utils.constants import OUTPUT_FILENAME -from benchmarks.utils.critics import extract_git_patch, get_completed_instances +from benchmarks.utils.critics import get_completed_instances from benchmarks.utils.iterative import aggregate_results, get_failed_instances from benchmarks.utils.models import ( EvalInstance, @@ -22,7 +22,7 @@ EvalMetadata, EvalOutput, ) -from openhands.sdk import get_logger +from openhands.sdk import Event, get_logger from openhands.sdk.critic import CriticBase, CriticResult from openhands.sdk.event import LLMConvertibleEvent from openhands.sdk.workspace import RemoteWorkspace @@ -85,6 +85,14 @@ def evaluate_instance( """Run evaluation for a single instance in the provided workspace.""" raise NotImplementedError + def critic_evaluate( + self, history: list[Event], test_result: dict[str, Any] + ) -> CriticResult: + """Evaluate the instance using the configured critic.""" + llm_events = [e for e in history if isinstance(e, LLMConvertibleEvent)] + git_patch = test_result.get("git_patch") + return self.metadata.critic.evaluate(llm_events, git_patch) + def _create_error_output( self, instance: EvalInstance, error: Exception, retry_count: int ) -> EvalOutput: @@ -359,16 +367,6 @@ def _process_one_mp( try: workspace = self.prepare_workspace(instance) out = self.evaluate_instance(instance, workspace) - - # Evaluate with critic and save result - llm_events = [ - e for e in out.history if isinstance(e, LLMConvertibleEvent) - ] - git_patch = extract_git_patch(out) - out.critic_result = self.metadata.critic.evaluate( - llm_events, git_patch - ) - logger.info("[child] done id=%s", instance.id) return instance, out except Exception as e: From 0d42292d490638a669c6b7bb2eb4b0e37e6d6629 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Wed, 26 Nov 2025 04:37:40 +0000 Subject: [PATCH 12/14] bump commit --- vendor/software-agent-sdk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/software-agent-sdk b/vendor/software-agent-sdk index 3fb0068f..6371a640 160000 --- a/vendor/software-agent-sdk +++ b/vendor/software-agent-sdk @@ -1 +1 @@ -Subproject commit 3fb0068f44bb31c7582b20796f004703e41bd96c +Subproject commit 6371a640479a7886778dfe2d3cfe4a5a3977fdd9 From 5acec8a985470472f441c7dd85b92fee5fe79934 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Wed, 26 Nov 2025 04:37:47 +0000 Subject: [PATCH 13/14] print more errors --- benchmarks/utils/evaluation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/utils/evaluation.py b/benchmarks/utils/evaluation.py index b72fc02b..b2851744 100644 --- a/benchmarks/utils/evaluation.py +++ b/benchmarks/utils/evaluation.py @@ -99,7 +99,7 @@ def _create_error_output( """Create an EvalOutput object for a failed instance.""" error_msg = ( f"Instance failed after {retry_count} retries. Last error: {str(error)}" - )[:200] + )[:1000] # Create critic result with score=0 and error message critic_result = CriticResult( From fb628a356a1214f18e108bc51dea85623bf58439 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Wed, 26 Nov 2025 04:50:00 +0000 Subject: [PATCH 14/14] bump commit --- vendor/software-agent-sdk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/software-agent-sdk b/vendor/software-agent-sdk index 6371a640..3c47b6fc 160000 --- a/vendor/software-agent-sdk +++ b/vendor/software-agent-sdk @@ -1 +1 @@ -Subproject commit 6371a640479a7886778dfe2d3cfe4a5a3977fdd9 +Subproject commit 3c47b6fc55f85242d519bdf3dfaf58c56ea2e31b