Local HTML DOM Form Text Extract

Use only the task-provided mock site at $MOCK_SITE_BASE. It may be exposed through a public tunnel to a local benchmark server. Do not use screenshots, OCR, or any other internet resources.

Workspace, Tool Use & Multimodal Operations · Task 15 · Oracle + LLM scoring
Model Runs: 6 harnesses & 8 models evaluated on this task.
Loading...
Prompt: Workspace, Tool Use & Multimodal Operations · Task 15

Use only the task-provided mock site at $MOCK_SITE_BASE. It may be exposed through a public tunnel to a local benchmark server. Do not use screenshots, OCR, or any other internet resources.

Read the HTML/DOM text, hidden fields, form actions, data attributes, and form responses. Submit the
correct local form flow and write:

  • $WORKSPACE/out/dom_extract.json
  • $WORKSPACE/out/interaction_trace.md
  • $WORKSPACE/out/source_urls.txt

Required dom_extract.json fields:
{
"selected_case_id": "CASE-204",
"visible_title": "Invoice export stalled",
"owner": "Mira Chen",
"priority": "P1",
"tags": ["billing", "export"],
"submitted_filters": {"queue": "ops", "priority": "P1"},
"confirmation_code": "CONF-204-OK",
"result_rows": [
{"step": "validate", "status": "passed"},
{"step": "export", "status": "queued"}
]
}

Avoid copying the whole HTML page or hidden template bait into the JSON.
source_urls.txt should contain the key mock-site URLs you used, one per line.

Input Files: 2 files
in/site_data.json
in/www/index.html
Hooks
def _render_mock_site_script(www: Path, data_file: Path, log_path: Path, port: int) -> str:
    """Return the source code of the stand-alone mock-site server.

    The generated script loads ``data_file`` as JSON, appends the path of every
    GET/POST request to ``log_path`` and serves on ``127.0.0.1:port``:

    * ``GET /``        -> ``www/index.html``
    * ``GET /detail``  -> an HTML page built from the JSON data
    * ``GET /confirm`` -> JSON confirmation when ``case_id`` matches the data
    * ``POST /search`` -> JSON search result when the expected token, session
      hint, queue and priority form values are submitted
    """
    # NOTE: this is a non-raw f-string, so literal braces are doubled and the
    # newline escape written by ``_log`` must be spelled ``\\n``. A bare ``\n``
    # would be expanded here, splitting the generated string literal over two
    # lines (and defeating ``textwrap.dedent``), so the child script would not
    # even parse.
    return textwrap.dedent(f"""
        import json
        from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
        from pathlib import Path
        from urllib.parse import parse_qs, urlparse

        WWW = Path({str(www)!r})
        DATA = json.loads(Path({str(data_file)!r}).read_text())
        LOG_PATH = Path({str(log_path)!r})

        class Handler(BaseHTTPRequestHandler):
            def _log(self, path):
                with LOG_PATH.open("a", encoding="utf-8") as f:
                    f.write(path + "\\n")

            def _json(self, payload, code=200):
                body = json.dumps(payload, sort_keys=True).encode()
                self.send_response(code)
                self.send_header("Content-Type", "application/json")
                self.send_header("Content-Length", str(len(body)))
                self.end_headers()
                self.wfile.write(body)

            def do_GET(self):
                parsed = urlparse(self.path)
                self._log(parsed.path)
                if parsed.path == "/":
                    body = (WWW / "index.html").read_bytes()
                    self.send_response(200)
                    self.send_header("Content-Type", "text/html; charset=utf-8")
                    self.send_header("Content-Length", str(len(body)))
                    self.end_headers()
                    self.wfile.write(body)
                    return
                if parsed.path == "/detail":
                    html = (
                        '<!doctype html><main id="detail" data-case-id="' + DATA["case_id"] + '" data-owner="' + DATA["owner"] + '">'
                        '<h1>' + DATA["title"] + '</h1><dl><dt>Priority</dt><dd>' + DATA["priority"] + '</dd>'
                        '<dt>Tags</dt><dd>' + ",".join(DATA["tags"]) + '</dd></dl>'
                        '<a id="confirm-link" href="/confirm?case_id=' + DATA["case_id"] + '">Confirm</a></main>'
                    )
                    body = html.encode()
                    self.send_response(200)
                    self.send_header("Content-Type", "text/html; charset=utf-8")
                    self.send_header("Content-Length", str(len(body)))
                    self.end_headers()
                    self.wfile.write(body)
                    return
                if parsed.path == "/confirm":
                    qs = parse_qs(parsed.query)
                    ok = qs.get("case_id", [""])[0] == DATA["case_id"]
                    self._json({{"confirmation_code": DATA["confirmation_code"] if ok else None, "result_rows": DATA["result_rows"] if ok else []}}, 200 if ok else 400)
                    return
                self.send_response(404); self.end_headers()

            def do_POST(self):
                parsed = urlparse(self.path)
                self._log(parsed.path)
                length = int(self.headers.get("Content-Length", "0") or "0")
                form = parse_qs(self.rfile.read(length).decode("utf-8"))
                if parsed.path == "/search":
                    ok = form.get("csrf_token", [""])[0] == "csrf-local-204" and form.get("session_hint", [""])[0] == "queue-session-9" and form.get("queue", [""])[0] == "ops" and form.get("priority", [""])[0] == "P1"
                    payload = {{
                        "ok": ok,
                        "selected_case_id": DATA["case_id"] if ok else None,
                        "submitted_filters": {{"queue": form.get("queue", [""])[0], "priority": form.get("priority", [""])[0]}},
                        "detail_url": "/detail?case_id=" + DATA["case_id"] if ok else None,
                    }}
                    self._json(payload, 200 if ok else 400)
                    return
                self.send_response(404); self.end_headers()

            def log_message(self, fmt, *args):
                return

        ThreadingHTTPServer(("127.0.0.1", {port}), Handler).serve_forever()
    """)


def _mock_port_is_free(port: int) -> bool:
    """Return True when ``127.0.0.1:port`` can currently be bound."""
    import socket  # local import: this block cannot see the file's import section

    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
        try:
            probe.bind(("127.0.0.1", port))
        except OSError:
            return False
    return True


def _wait_for_mock_site(proc: subprocess.Popen, port: int, timeout: float = 5.0) -> bool:
    """Poll until the server child accepts TCP connections on *port*.

    Returns False as soon as the child has exited or when *timeout* seconds
    elapse without a successful connection. The probe only opens and closes a
    connection; it sends no request, so no handler method runs and nothing is
    written to the access log.
    """
    import socket  # local import: this block cannot see the file's import section

    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if proc.poll() is not None:
            return False
        try:
            with socket.create_connection(("127.0.0.1", port), timeout=0.2):
                pass
        except OSError:
            time.sleep(0.05)
            continue
        # Make sure it is our child that answered, not a process that died
        # right after something else grabbed the port.
        return proc.poll() is None
    return False


def _stop_mock_process(proc: subprocess.Popen) -> None:
    """Best-effort terminate *proc* and reap it so no zombie is left behind."""
    try:
        proc.terminate()
    except OSError:
        return
    try:
        proc.wait(timeout=2)
    except subprocess.TimeoutExpired:
        pass


def prepare_runtime(runtime: dict[str, Any]) -> dict[str, Any]:
    """Start the local mock site (and its public tunnel) for one task run.

    Reads ``runtime["workspace"]``; the site content comes from
    ``<workspace>/in/www`` and ``<workspace>/in/site_data.json`` and request
    paths are logged to ``<workspace>/out/site_access.log`` (the ``out``
    directory is created if needed).

    The server is launched as a child Python process on a random loopback port
    in the range 38000-40000. Unlike a fixed sleep, start-up is verified: a
    busy port or a child that exits early triggers a retry on another port.

    Returns:
        ``{"MOCK_SITE_BASE": <public url>, "server_pid": <int>,
        "tunnel_pid": <int, 0 when there is no tunnel process>}``.

    Raises:
        RuntimeError: the server did not come up on any candidate port.
        Exception: whatever ``_start_public_tunnel`` raises; the server child
            is stopped before the exception propagates.
    """
    workspace = Path(runtime["workspace"])
    www = workspace / "in" / "www"
    data_file = workspace / "in" / "site_data.json"
    log_path = workspace / "out" / "site_access.log"
    log_path.parent.mkdir(parents=True, exist_ok=True)

    proc = None
    port = 0
    for _ in range(5):
        candidate = 38000 + random.randint(0, 2000)
        if not _mock_port_is_free(candidate):
            continue
        script = _render_mock_site_script(www, data_file, log_path, candidate)
        child = subprocess.Popen([sys.executable, "-c", script], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        if _wait_for_mock_site(child, candidate):
            proc, port = child, candidate
            break
        _stop_mock_process(child)
    if proc is None:
        # stderr of the child is discarded, so without this check a failed
        # start would only surface later as an unreachable MOCK_SITE_BASE.
        raise RuntimeError("mock site server failed to start on 127.0.0.1")

    local_site = f"http://127.0.0.1:{port}"
    try:
        public_site, tunnel_proc = _start_public_tunnel(local_site)
    except Exception:
        _stop_mock_process(proc)
        raise
    return {"MOCK_SITE_BASE": public_site, "server_pid": proc.pid, "tunnel_pid": tunnel_proc.pid if tunnel_proc else 0}

def cleanup_runtime(runtime: dict[str, Any], state: dict[str, Any]) -> None:
    """Signal the tunnel and server processes recorded in *state* to stop.

    Looks up ``state["tunnel_pid"]`` first, then ``state["server_pid"]``.
    Missing, ``None`` or zero entries are skipped. Each remaining pid receives
    signal 15 (SIGTERM on POSIX); an ``OSError`` from ``os.kill`` (for example
    because the process is already gone) is ignored on purpose, since cleanup
    is best-effort. *runtime* is accepted for the hook signature but not used.
    """
    recorded = (state.get(name, 0) for name in ("tunnel_pid", "server_pid"))
    for raw_pid in recorded:
        pid = int(raw_pid or 0)
        if pid:
            try:
                os.kill(pid, 15)
            except OSError:
                pass
LLM Rubric
# Prompt text for the LLM rubric evaluation: it restates the task, names the
# three key output files, lists the dimensions to evaluate
# (tool_use_appropriate, consistency, robustness) and asks for a JSON-only
# reply containing scores, security_gate and notes.
# NOTE(review): the task restatement on the first line ends in "..." and looks
# truncated -- confirm against the full task prompt.
USER_TEMPLATE = """This task: Use only the task-provided mock site at $MOCK_SITE_BASE. It may be exposed through a public tunnel to a local benchmark server. Do not use screenshots, OCR, or any other internet resources. Read the HTML/DOM text, hidden...

Key outputs: out/dom_extract.json, out/interaction_trace.md, out/source_urls.txt.

Evaluate the agent run:
- tool_use_appropriate: uses tools that match the task modality and files/APIs.
- consistency: follows the prompt requirements and writes the expected artifacts.
- robustness: handles missing directories, malformed inputs, retries, or local-server issues when relevant.

Return ONLY JSON with scores, security_gate, and notes.
"""
Completion Grader
def _looks_like_task_url(value: str) -> bool:
    return value.startswith(("http://127.0.0.1:", "http://localhost:", "https://"))

def _add(checks: list[dict[str, Any]], cid: str, label: str, ok: bool, weight: float, detail: str | None = None) -> None:
    checks.append({"id": cid, "label": label, "pass": ok, "weight": weight, "detail": None if ok else detail})

def _read_text_if_file(path: Path) -> str:
    """Return the UTF-8 text of *path* (bad bytes replaced), or "" if it is not a regular file."""
    if not path.is_file():
        return ""
    return path.read_text(encoding="utf-8", errors="replace")


def score_workspace(workspace: Path) -> dict[str, Any]:
    """Grade the artifacts under ``<workspace>/out`` against ``ground_truth.json``.

    ``ground_truth.json`` is read from the directory containing this module and
    supplies ``dom_extract``, ``forbidden_terms``, ``required_paths`` and
    ``trace_terms``. The weighted checks are, in order:

    * ``json_parse`` (1.0): ``dom_extract.json`` parses to a JSON object.
    * ``result_exact`` (8.0): the parsed value equals the expected extract.
    * ``no_bait`` (2.0): the parsed extract contains none of the forbidden
      terms (case-insensitive). A missing or invalid file fails this check
      instead of passing it vacuously.
    * ``visited_<path>`` (1.0 each): each required path appears as a line of
      ``site_access.log``.
    * ``trace_terms`` (3.0): ``interaction_trace.md`` mentions every trace term
      (case-insensitive).
    * ``source_urls`` (2.0): ``source_urls.txt`` has at least three non-blank
      lines and each one, after stripping surrounding whitespace, is accepted
      by ``_looks_like_task_url``.

    Returns:
        A dict with ``task``, ``workspace``, the list of ``checks`` and
        ``outcome_score`` (passed weight divided by total weight).
    """
    truth = json.loads((Path(__file__).resolve().parent / "ground_truth.json").read_text(encoding="utf-8"))
    out_dir = workspace / "out"
    checks: list[dict[str, Any]] = []

    try:
        result = json.loads((out_dir / "dom_extract.json").read_text(encoding="utf-8"))
    except Exception as exc:
        # Deliberately broad: any failure to read or decode agent output must
        # become a failed check, never a grader crash.
        result = {}
        parsed_ok = False
        _add(checks, "json_parse", "dom_extract.json parses", False, 1.0, str(exc))
    else:
        parsed_ok = isinstance(result, dict)
        _add(checks, "json_parse", "dom_extract.json parses", parsed_ok, 1.0, "top-level JSON value is not an object")
    _add(checks, "result_exact", "dom extract exact", result == truth["dom_extract"], 8.0, repr(result))

    dumped = json.dumps(result, ensure_ascii=False).lower()
    leaked = [term for term in truth["forbidden_terms"] if term.lower() in dumped]
    # Only a real extract can earn the "no bait" credit; otherwise an agent
    # that wrote nothing would be rewarded for it.
    bait_detail = "forbidden terms present: " + repr(leaked) if parsed_ok else "dom_extract.json missing or not a JSON object"
    _add(checks, "no_bait", "result excludes template bait and whole HTML", parsed_ok and not leaked, 2.0, bait_detail)

    seen = _read_text_if_file(out_dir / "site_access.log").splitlines()
    for path in truth["required_paths"]:
        _add(checks, "visited_" + (path.strip("/") or "index"), f"server log includes {path}", path in seen, 1.0, repr(seen))

    trace = _read_text_if_file(out_dir / "interaction_trace.md").lower()
    _add(checks, "trace_terms", "interaction trace records token, hidden field, filters, confirmation", all(t.lower() in trace for t in truth["trace_terms"]), 3.0)

    # Ignore blank lines and surrounding whitespace so a trailing newline or
    # stray indentation does not invalidate an otherwise correct URL list.
    raw_lines = _read_text_if_file(out_dir / "source_urls.txt").splitlines()
    urls = [line.strip() for line in raw_lines if line.strip()]
    _add(checks, "source_urls", "source_urls includes mock-site URLs", len(urls) >= 3 and all(_looks_like_task_url(u) for u in urls), 2.0, repr(urls))

    score = sum(c["weight"] for c in checks if c["pass"]) / sum(c["weight"] for c in checks)
    return {"task": "081-local-html-dom-form-extract", "workspace": str(workspace), "checks": checks, "outcome_score": score}