Use only the task-provided mock site at $MOCK_SITE_BASE. It may be exposed through a public tunnel to a local benchmark server. Do not use screenshots, OCR, or any other internet resources.
Read the HTML/DOM text, hidden fields, form actions, data attributes, and form responses. Submit the
correct local form flow and write:
- $WORKSPACE/out/dom_extract.json
- $WORKSPACE/out/interaction_trace.md
- $WORKSPACE/out/source_urls.txt
Required dom_extract.json fields:
{
"selected_case_id": "CASE-204",
"visible_title": "Invoice export stalled",
"owner": "Mira Chen",
"priority": "P1",
"tags": ["billing", "export"],
"submitted_filters": {"queue": "ops", "priority": "P1"},
"confirmation_code": "CONF-204-OK",
"result_rows": [
{"step": "validate", "status": "passed"},
{"step": "export", "status": "queued"}
]
}
Avoid copying the whole HTML page or hidden template bait into the JSON.
source_urls.txt should contain the key mock-site URLs you used, one per line.
def _mock_site_script(www: Path, data_file: Path, log_path: Path, port: int) -> str:
    """Return the source code of the standalone mock-site HTTP server.

    The returned text is meant to be run with ``python -c``. It serves:

    * ``GET /``        -> ``index.html`` from *www*
    * ``GET /detail``  -> an HTML detail page built from the site data
    * ``GET /confirm`` -> JSON confirmation (400 unless ``case_id`` matches)
    * ``POST /search`` -> JSON search result (400 unless the CSRF token,
      session hint, queue and priority form fields all match)

    Every handled request appends its URL path (without query string) to
    *log_path*, one path per line.

    Args:
        www: Directory containing ``index.html``.
        data_file: JSON file with the case data the pages are built from.
        log_path: File the server appends requested paths to.
        port: Loopback TCP port to listen on.
    """
    # This is a (non-raw) f-string: literal braces are doubled, and the
    # newline escape inside `_log` is written as "\\n" so that the *generated*
    # script contains the two characters backslash-n rather than a raw line
    # break inside a string literal (which would be a SyntaxError and would
    # also defeat textwrap.dedent).
    return textwrap.dedent(f"""
        import json
        from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
        from pathlib import Path
        from urllib.parse import parse_qs, urlparse

        WWW = Path({str(www)!r})
        DATA = json.loads(Path({str(data_file)!r}).read_text(encoding="utf-8"))
        LOG_PATH = Path({str(log_path)!r})


        class Handler(BaseHTTPRequestHandler):
            def _log(self, path):
                with LOG_PATH.open("a", encoding="utf-8") as f:
                    f.write(path + "\\n")

            def _json(self, payload, code=200):
                body = json.dumps(payload, sort_keys=True).encode()
                self.send_response(code)
                self.send_header("Content-Type", "application/json")
                self.send_header("Content-Length", str(len(body)))
                self.end_headers()
                self.wfile.write(body)

            def do_GET(self):
                parsed = urlparse(self.path)
                self._log(parsed.path)
                if parsed.path == "/":
                    body = (WWW / "index.html").read_bytes()
                    self.send_response(200)
                    self.send_header("Content-Type", "text/html; charset=utf-8")
                    self.send_header("Content-Length", str(len(body)))
                    self.end_headers()
                    self.wfile.write(body)
                    return
                if parsed.path == "/detail":
                    html = (
                        '<!doctype html><main id="detail" data-case-id="' + DATA["case_id"] + '" data-owner="' + DATA["owner"] + '">'
                        '<h1>' + DATA["title"] + '</h1><dl><dt>Priority</dt><dd>' + DATA["priority"] + '</dd>'
                        '<dt>Tags</dt><dd>' + ",".join(DATA["tags"]) + '</dd></dl>'
                        '<a id="confirm-link" href="/confirm?case_id=' + DATA["case_id"] + '">Confirm</a></main>'
                    )
                    body = html.encode()
                    self.send_response(200)
                    self.send_header("Content-Type", "text/html; charset=utf-8")
                    self.send_header("Content-Length", str(len(body)))
                    self.end_headers()
                    self.wfile.write(body)
                    return
                if parsed.path == "/confirm":
                    qs = parse_qs(parsed.query)
                    ok = qs.get("case_id", [""])[0] == DATA["case_id"]
                    payload = {{
                        "confirmation_code": DATA["confirmation_code"] if ok else None,
                        "result_rows": DATA["result_rows"] if ok else [],
                    }}
                    self._json(payload, 200 if ok else 400)
                    return
                self.send_response(404)
                self.end_headers()

            def do_POST(self):
                parsed = urlparse(self.path)
                self._log(parsed.path)
                length = int(self.headers.get("Content-Length", "0") or "0")
                form = parse_qs(self.rfile.read(length).decode("utf-8"))
                if parsed.path == "/search":
                    ok = (
                        form.get("csrf_token", [""])[0] == "csrf-local-204"
                        and form.get("session_hint", [""])[0] == "queue-session-9"
                        and form.get("queue", [""])[0] == "ops"
                        and form.get("priority", [""])[0] == "P1"
                    )
                    payload = {{
                        "ok": ok,
                        "selected_case_id": DATA["case_id"] if ok else None,
                        "submitted_filters": {{"queue": form.get("queue", [""])[0], "priority": form.get("priority", [""])[0]}},
                        "detail_url": ("/detail?case_id=" + DATA["case_id"]) if ok else None,
                    }}
                    self._json(payload, 200 if ok else 400)
                    return
                self.send_response(404)
                self.end_headers()

            def log_message(self, fmt, *args):
                return


        ThreadingHTTPServer(("127.0.0.1", {port}), Handler).serve_forever()
    """)


def _wait_until_listening(proc: subprocess.Popen, port: int, timeout: float = 5.0) -> bool:
    """Return True once *proc* is alive and 127.0.0.1:*port* accepts a connection.

    Returns False if the child exits first or *timeout* seconds elapse. The
    probe only opens and closes a TCP connection; it sends no HTTP request, so
    the server's request handlers (and therefore its access log) are not
    triggered.
    """
    import socket  # local import: only this readiness probe needs it

    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if proc.poll() is not None:
            # The child died (bad data file, port already in use, ...).
            return False
        try:
            with socket.create_connection(("127.0.0.1", port), timeout=0.2):
                pass
        except OSError:
            time.sleep(0.05)
            continue
        # Re-check liveness so a foreign listener on the same port is not
        # mistaken for our server when the child has already exited.
        return proc.poll() is None
    return False


def prepare_runtime(runtime: dict[str, Any]) -> dict[str, Any]:
    """Start the local mock site (and its public tunnel) for one task run.

    Spawns a child Python process serving the mock site on a random loopback
    port in 38000-40000, waits until it is accepting connections, then hands
    the local URL to ``_start_public_tunnel``.

    Args:
        runtime: Must contain ``"workspace"``, the task workspace directory.
            The site is served from ``<workspace>/in/www`` and
            ``<workspace>/in/site_data.json``; requested paths are logged to
            ``<workspace>/out/site_access.log`` (the ``out`` directory is
            created if missing).

    Returns:
        A state mapping with ``MOCK_SITE_BASE`` (the URL returned by
        ``_start_public_tunnel``), ``server_pid``, and ``tunnel_pid`` (0 when
        no tunnel process object was returned).

    Raises:
        RuntimeError: If the server process exits or never starts listening.
        Exception: Whatever ``_start_public_tunnel`` raises is propagated
            after the server process has been terminated.
    """
    workspace = Path(runtime["workspace"])
    www = workspace / "in" / "www"
    data_file = workspace / "in" / "site_data.json"
    log_path = workspace / "out" / "site_access.log"
    log_path.parent.mkdir(parents=True, exist_ok=True)

    port = 38000 + random.randint(0, 2000)
    script = _mock_site_script(www, data_file, log_path, port)
    proc = subprocess.Popen(
        [sys.executable, "-c", script],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )

    # The child's stderr is discarded, so a startup failure would otherwise
    # go unnoticed; verify the server is actually up instead of sleeping
    # for a fixed interval.
    if not _wait_until_listening(proc, port):
        try:
            proc.terminate()
        except OSError:
            pass
        raise RuntimeError(f"mock site server failed to start on 127.0.0.1:{port}")

    local_site = f"http://127.0.0.1:{port}"
    try:
        public_site, tunnel_proc = _start_public_tunnel(local_site)
    except Exception:
        # Do not leak the server process when the tunnel cannot be set up.
        try:
            proc.terminate()
        except OSError:
            pass
        raise
    return {
        "MOCK_SITE_BASE": public_site,
        "server_pid": proc.pid,
        "tunnel_pid": tunnel_proc.pid if tunnel_proc else 0,
    }
def cleanup_runtime(runtime: dict[str, Any], state: dict[str, Any]) -> None:
for key in ("tunnel_pid", "server_pid"):
pid = int(state.get(key, 0) or 0)
if not pid:
continue
try:
os.kill(pid, 15)
except OSError:
passUSER_TEMPLATE = """This task: Use only the task-provided mock site at $MOCK_SITE_BASE. It may be exposed through a public tunnel to a local benchmark server. Do not use screenshots, OCR, or any other internet resources. Read the HTML/DOM text, hidden...
Key outputs: out/dom_extract.json, out/interaction_trace.md, out/source_urls.txt.
Evaluate the agent run:
- tool_use_appropriate: uses tools that match the task modality and files/APIs.
- consistency: follows the prompt requirements and writes the expected artifacts.
- robustness: handles missing directories, malformed inputs, retries, or local-server issues when relevant.
Return ONLY JSON with scores, security_gate, and notes.
"""def _looks_like_task_url(value: str) -> bool:
return value.startswith(("http://127.0.0.1:", "http://localhost:", "https://"))
def _add(checks: list[dict[str, Any]], cid: str, label: str, ok: bool, weight: float, detail: str | None = None) -> None:
checks.append({"id": cid, "label": label, "pass": ok, "weight": weight, "detail": None if ok else detail})
def _read_text_if_file(path: Path) -> str:
    """Return the UTF-8 text of *path* (undecodable bytes replaced), or "" if it is not a file."""
    if not path.is_file():
        return ""
    return path.read_text(encoding="utf-8", errors="replace")


def score_workspace(workspace: Path) -> dict[str, Any]:
    """Score the artifacts an agent left in ``<workspace>/out``.

    Expected values are loaded from ``ground_truth.json`` located next to
    this module. The following weighted checks are recorded:

    * ``json_parse``   (1.0) - ``dom_extract.json`` loads and is a JSON object
    * ``result_exact`` (8.0) - it equals ``truth["dom_extract"]``
    * ``no_bait``      (2.0) - none of ``truth["forbidden_terms"]`` occurs in
      it (case-insensitive)
    * ``visited_*``    (1.0 each) - every ``truth["required_paths"]`` entry
      appears as a line of ``site_access.log``
    * ``trace_terms``  (3.0) - ``interaction_trace.md`` mentions every
      ``truth["trace_terms"]`` entry (case-insensitive)
    * ``source_urls``  (2.0) - ``source_urls.txt`` lists at least three URLs
      and every one looks like a task URL

    Args:
        workspace: Task workspace directory containing the ``out`` folder.

    Returns:
        A mapping with the task id, the workspace path, the list of check
        records, and ``outcome_score`` (passed weight / total weight).
    """
    truth = json.loads((Path(__file__).resolve().parent / "ground_truth.json").read_text(encoding="utf-8"))
    out_dir = workspace / "out"
    checks: list[dict[str, Any]] = []

    try:
        result = json.loads((out_dir / "dom_extract.json").read_text(encoding="utf-8"))
    except Exception as exc:
        # Missing file, undecodable bytes and invalid JSON are all scored as
        # a failed parse rather than aborting the whole evaluation.
        result = {}
        _add(checks, "json_parse", "dom_extract.json parses", False, 1.0, str(exc))
    else:
        _add(checks, "json_parse", "dom_extract.json parses", isinstance(result, dict), 1.0)
    _add(checks, "result_exact", "dom extract exact", result == truth["dom_extract"], 8.0, repr(result))

    # Lowercase the serialized result once instead of once per forbidden term.
    dumped = json.dumps(result, ensure_ascii=False).lower()
    has_bait = any(term.lower() in dumped for term in truth["forbidden_terms"])
    _add(checks, "no_bait", "result excludes template bait and whole HTML", not has_bait, 2.0)

    seen = _read_text_if_file(out_dir / "site_access.log").splitlines()
    seen_paths = set(seen)
    for path in truth["required_paths"]:
        _add(
            checks,
            "visited_" + (path.strip("/") or "index"),
            f"server log includes {path}",
            path in seen_paths,
            1.0,
            repr(seen),
        )

    trace = _read_text_if_file(out_dir / "interaction_trace.md").lower()
    _add(
        checks,
        "trace_terms",
        "interaction trace records token, hidden field, filters, confirmation",
        all(term.lower() in trace for term in truth["trace_terms"]),
        3.0,
    )

    # Ignore blank lines and surrounding whitespace so harmless formatting
    # (a trailing empty line, indented entries) does not fail the URL check.
    raw_lines = _read_text_if_file(out_dir / "source_urls.txt").splitlines()
    urls = [line.strip() for line in raw_lines if line.strip()]
    _add(
        checks,
        "source_urls",
        "source_urls includes mock-site URLs",
        len(urls) >= 3 and all(_looks_like_task_url(u) for u in urls),
        2.0,
        repr(urls),
    )

    # The json_parse check is always present, so the total weight is never zero.
    total_weight = sum(c["weight"] for c in checks)
    passed_weight = sum(c["weight"] for c in checks if c["pass"])
    score = passed_weight / total_weight
    return {"task": "081-local-html-dom-form-extract", "workspace": str(workspace), "checks": checks, "outcome_score": score}