Model Runs
6 harnesses & 8 models evaluated on this task.
Loading...
Prompt
Workspace, Tool Use & Multimodal Operations · Task 3
Open the page at $MOCK_PAGE, read the visible text on the page that is directly relevant to this benchmark, and save your excerpt to $WORKSPACE/out/page_extract.txt.
Requirements:
- Write at least one line.
- The excerpt must include the page's key marker text.
Input Files
1 file
www/index.html
Hooks
def prepare_runtime(runtime: dict[str, Any]) -> dict[str, Any]:
    """Serve the workspace's ``www`` directory over HTTP and expose it via a tunnel.

    A ``python3 -m http.server`` child process is started on a random port in
    the range 31000-33000, and the resulting local URL is passed to
    ``_start_public_tunnel`` (defined elsewhere in this file).

    Args:
        runtime: Runtime description; only ``runtime["workspace"]`` is read.

    Returns:
        A state dict with ``MOCK_PAGE`` (public URL ending in ``/``),
        ``server_pid`` and ``tunnel_pid`` (``0`` when the tunnel helper
        returned no process object).

    Raises:
        RuntimeError: If the HTTP server exits or does not accept connections
            within the startup deadline.
        Exception: Whatever ``_start_public_tunnel`` raises is propagated after
            the HTTP server has been asked to terminate.
    """
    # Local import, in the same style as cleanup_runtime's local ``import os``.
    import socket

    workspace = Path(runtime["workspace"])
    port = 31000 + random.randint(0, 2000)
    www = workspace / "www"
    proc = subprocess.Popen(
        ["python3", "-m", "http.server", str(port), "--directory", str(www)],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )
    local_page = f"http://127.0.0.1:{port}/"
    try:
        # Wait until the server really accepts connections instead of sleeping
        # for a fixed interval: a slow start or an early exit (for example the
        # port being unavailable) would otherwise hand a dead URL to the tunnel.
        deadline = time.monotonic() + 5.0
        while True:
            if proc.poll() is not None:
                raise RuntimeError(
                    f"http.server exited with code {proc.returncode} "
                    f"before serving port {port}"
                )
            try:
                socket.create_connection(("127.0.0.1", port), timeout=0.2).close()
            except OSError:
                if time.monotonic() >= deadline:
                    raise RuntimeError(
                        f"http.server did not start listening on port {port}"
                    ) from None
                time.sleep(0.05)
            else:
                break
        public_page, tunnel_proc = _start_public_tunnel(local_page.rstrip("/"))
    except Exception:
        # Do not leave the HTTP server behind when setup fails.
        try:
            proc.terminate()
        except OSError:
            pass
        raise
    return {
        # Normalise to exactly one trailing slash regardless of how the tunnel
        # helper formats its URL.
        "MOCK_PAGE": public_page.rstrip("/") + "/",
        "server_pid": proc.pid,
        "tunnel_pid": tunnel_proc.pid if tunnel_proc else 0,
    }
def cleanup_runtime(runtime: dict[str, Any], state: dict[str, Any]) -> None:
    """Send signal 15 to the tunnel and server processes recorded in *state*.

    Args:
        runtime: Unused here; part of the hook signature.
        state: Mapping that may hold ``tunnel_pid`` and ``server_pid``.
            Missing, falsy, or non-positive values are skipped.

    The tunnel is signalled before the server. ``OSError`` from ``os.kill``
    is ignored, so cleanup stays best-effort.
    """
    import os

    terminate_signal = 15  # numeric value of SIGTERM
    for pid_field in ("tunnel_pid", "server_pid"):
        target_pid = int(state.get(pid_field, 0) or 0)
        if target_pid > 0:
            try:
                os.kill(target_pid, terminate_signal)
            except OSError:
                # Best-effort: failures to signal are deliberately ignored.
                pass
LLM Rubric
# Prompt text for the LLM rubric of this task. It summarises the task (visit
# MOCK_PAGE, write visible content to out/page_extract.txt), notes that the
# oracle only checks the output file for a fixed substring, lists the three
# criteria to evaluate (tool_use_appropriate, consistency, robustness), and
# asks for a JSON-only reply containing scores, security_gate, and notes.
USER_TEMPLATE = """This task: visit the task-provided MOCK_PAGE URL
(static page), then write visible page content into out/page_extract.txt.
Oracle only checks that file contains a fixed substring; it does not verify
HTTP or which tool was used.
Evaluate the agent run:
- tool_use_appropriate: prioritize use of browser and/or terminal fetch
against the given URL, and file write to out/page_extract.txt.
- consistency: open/fetch page -> extract relevant text -> write output file.
- robustness: connection/refusal or wrong port handled or retried.
Return ONLY JSON with scores, security_gate, and notes.
"""
Completion Grader
def score_workspace(workspace: Path) -> dict[str, Any]:
    """Grade a workspace by looking for the marker in ``out/page_extract.txt``.

    Args:
        workspace: Root directory of the agent's workspace.

    Returns:
        A report dict with the task id, the workspace path as a string, a
        single ``page_extract`` check, and ``outcome_score`` of ``1.0`` when
        the file contains ``BENCHMARK_PAGE`` and ``0.0`` otherwise. A missing
        file is treated as empty text.
    """
    extract_path = workspace / "out" / "page_extract.txt"
    # Undecodable bytes are replaced rather than raising, so a malformed file
    # can still be scanned for the marker.
    contents = (
        extract_path.read_text(encoding="utf-8", errors="replace")
        if extract_path.is_file()
        else ""
    )
    marker_found = "BENCHMARK_PAGE" in contents

    page_extract_check = {
        "id": "page_extract",
        "label": "out/page_extract.txt contains BENCHMARK_PAGE",
        "pass": marker_found,
        "weight": 1.0,
        "detail": None if marker_found else "missing marker",
    }
    return {
        "task": "003-browser",
        "workspace": str(workspace),
        "checks": [page_extract_check],
        "outcome_score": 1.0 if marker_found else 0.0,
    }