Local web form extraction

Use the local web page provided in the MOCK_FORM_URL environment variable. Do not use the public internet.

Workspace, Tool Use & Multimodal Operations · Task 10 · Oracle + LLM scoring
Model Runs: 6 harnesses & 8 models evaluated on this task.
Loading...
Prompt: Workspace, Tool Use & Multimodal Operations · Task 10

Use the local web page provided in the MOCK_FORM_URL environment variable. Do not use the public internet.

Task:

  1. Open the local page.
  2. Extract the CSRF token and hidden fields from the form.
  3. Submit the lookup form with:
  • order_id: A-1042
  • region: emea
  4. Follow the confirmation step if returned.
  5. Save the extracted result to $WORKSPACE/out/form_result.json and write $WORKSPACE/out/interaction_trace.md.

Required JSON format:
{
"query": {"order_id": "A-1042", "region": "emea"},
"marker": "FORM_LOOKUP_OK",
"result": {
"order_id": "A-1042",
"region": "emea",
"customer": "Nadia Rossi",
"status": "ready_for_invoice",
"total_usd": 1842.75,
"line_count": 3
}
}

Rules:

  • The marker must come from the lookup result page or response.
  • Include confirm_marker: "FORM_CONFIRM_OK" from the confirmation response.
  • Do not include unrelated page text, HTML, or the full index page in $WORKSPACE/out/form_result.json.
  • Numeric values must be numbers, not strings.
  • Do not modify $WORKSPACE/in/www/index.html.
  • interaction_trace.md must mention the homepage visit, CSRF token, hidden field, lookup submission, confirmation step, and final marker.
Input Files: 1 file
in/www/index.html
Hooks
def prepare_runtime(runtime: dict[str, Any]) -> dict[str, Any]:
    workspace = Path(runtime["workspace"])
    www = workspace / "in" / "www"
    log_path = workspace / "out" / "form_access.log"
    log_path.parent.mkdir(parents=True, exist_ok=True)
    port = 34000 + random.randint(0, 2000)
    script = textwrap.dedent(
        f"""
        import json
        from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
        from pathlib import Path
        from urllib.parse import parse_qs, urlparse

        WWW = Path({str(www)!r})
        LOG_PATH = Path({str(log_path)!r})

        ORDER = {{
            "order_id": "A-1042",
            "region": "emea",
            "customer": "Nadia Rossi",
            "status": "ready_for_invoice",
            "total_usd": 1842.75,
            "line_count": 3,
        }}

        class Handler(BaseHTTPRequestHandler):
            def do_GET(self):
                parsed = urlparse(self.path)
                with LOG_PATH.open("a", encoding="utf-8") as f:
                    f.write(parsed.path + "\n")
                if parsed.path == "/":
                    body = (WWW / "index.html").read_bytes()
                    self.send_response(200)
                    self.send_header("Content-Type", "text/html; charset=utf-8")
                    self.send_header("Content-Length", str(len(body)))
                    self.end_headers()
                    self.wfile.write(body)
                    return
                if parsed.path == "/confirm":
                    qs = parse_qs(parsed.query)
                    ok = qs.get("confirm_id", [""])[0] == "confirm-A-1042"
                    payload = {{
                        "marker": "FORM_CONFIRM_OK" if ok else "FORM_CONFIRM_FAILED",
                        "result": ORDER if ok else None,
                    }}
                    body = json.dumps(payload, sort_keys=True).encode("utf-8")
                    self.send_response(200)
                    self.send_header("Content-Type", "application/json")
                    self.send_header("Content-Length", str(len(body)))
                    self.end_headers()
                    self.wfile.write(body)
                    return
                self.send_response(404)
                self.end_headers()

            def do_POST(self):
                parsed = urlparse(self.path)
                with LOG_PATH.open("a", encoding="utf-8") as f:
                    f.write(parsed.path + "\n")
                length = int(self.headers.get("Content-Length", "0") or "0")
                form = parse_qs(self.rfile.read(length).decode("utf-8"))
                if parsed.path == "/lookup":
                    ok = (
                        form.get("csrf_token", [""])[0] == "local-csrf-742"
                        and form.get("request_source", [""])[0] == "invoice_portal"
                        and form.get("order_id", [""])[0] == "A-1042"
                        and form.get("region", [""])[0] == "emea"
                    )
                    payload = {{
                        "marker": "FORM_LOOKUP_OK" if ok else "FORM_LOOKUP_RETRY",
                        "confirm_url": "/confirm?confirm_id=confirm-A-1042" if ok else None,
                        "query": {{"order_id": form.get("order_id", [""])[0], "region": form.get("region", [""])[0]}},
                    }}
                    body = json.dumps(payload, sort_keys=True).encode("utf-8")
                    self.send_response(200 if ok else 400)
                    self.send_header("Content-Type", "application/json")
                    self.send_header("Content-Length", str(len(body)))
                    self.end_headers()
                    self.wfile.write(body)
                    return
                self.send_response(404)
                self.end_headers()

            def log_message(self, fmt, *args):
                return

        ThreadingHTTPServer(("127.0.0.1", {port}), Handler).serve_forever()
        """
    )
    proc = subprocess.Popen([sys.executable, "-c", script], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    time.sleep(0.3)
    return {"MOCK_FORM_URL": f"http://127.0.0.1:{port}/", "server_pid": proc.pid}

def cleanup_runtime(runtime: dict[str, Any], state: dict[str, Any]) -> None:
    """Stop the mock form server recorded in *state*, if any.

    Args:
        runtime: Harness runtime mapping (not used here).
        state: Mapping that may hold ``"server_pid"``; a missing, empty or
            zero value means there is nothing to stop.
    """
    server_pid = int(state.get("server_pid", 0) or 0)
    if not server_pid:
        return
    try:
        # Signal 15 asks the server process to terminate.
        os.kill(server_pid, 15)
    except OSError:
        # Best-effort cleanup: e.g. the process has already exited.
        pass
LLM Rubric
# Prompt text for the LLM rubric judge: it restates the task, lists the key
# output files, names three scoring dimensions (tool_use_appropriate,
# consistency, robustness) and demands a JSON-only reply containing scores,
# security_gate and notes.
# NOTE(review): the task summary ends in "..." and the key-outputs list repeats
# out/form_result.json with stray trailing dots — presumably generated or
# truncated text; confirm against the generator before editing the string.
USER_TEMPLATE = """This task: Use the local web page provided in the MOCK_FORM_URL environment variable. Do not use the public internet. Task: 1. Open the local page. 2. Extract the CSRF token and hidden fields from the form. 3. Submit the lookup for...

Key outputs: in/www/index.html., out/form_result.json, out/form_result.json., out/interaction_trace.md..

Evaluate the agent run:
- tool_use_appropriate: uses tools that match the task modality and files/APIs.
- consistency: follows the prompt requirements and writes the expected artifacts.
- robustness: handles missing directories, malformed inputs, retries, or local-server issues when relevant.

Return ONLY JSON with scores, security_gate, and notes.
"""
Completion Grader
def _sha256(path: Path) -> str:
    h = hashlib.sha256()
    with path.open("rb") as f:
        for chunk in iter(lambda: f.read(65536), b""):
            h.update(chunk)
    return h.hexdigest()

def _add(checks: list[dict[str, Any]], cid: str, label: str, ok: bool, weight: float, detail: str | None = None) -> None:
    checks.append({"id": cid, "label": label, "pass": ok, "weight": weight, "detail": None if ok else detail})

def score_workspace(workspace: Path) -> dict[str, Any]:
    """Grade the agent's outputs in *workspace* against ``ground_truth.json``.

    Weighted checks, in order: the input ``in/www/index.html`` is unchanged
    (1), ``out/form_result.json`` parses to a JSON object (1), ``query`` (2),
    ``marker`` (2), ``confirm_marker`` (2) and ``result`` (5) equal the ground
    truth, the output contains no index-page text (1), each required path
    appears in ``out/form_access.log`` (1 each), and
    ``out/interaction_trace.md`` mentions every required term (2).

    Args:
        workspace: Root of the task workspace holding ``in/`` and ``out/``.

    Returns:
        A dict with ``task``, ``workspace``, the list of ``checks`` and
        ``outcome_score`` (passed weight divided by total weight).
    """
    truth = json.loads((Path(__file__).resolve().parent / "ground_truth.json").read_text(encoding="utf-8"))
    checks: list[dict[str, Any]] = []

    index = workspace / "in" / "www" / "index.html"
    actual_index_hash = _sha256(index) if index.is_file() else ""
    _add(checks, "source_index", "www/index.html unchanged", actual_index_hash == _INDEX_SHA256, 1.0, f"got {actual_index_hash!r}")

    try:
        parsed = json.loads((workspace / "out" / "form_result.json").read_text(encoding="utf-8"))
    except Exception as exc:
        # Grader boundary: a missing file, bad encoding or malformed JSON all
        # count as a failed parse instead of aborting the scoring run.
        parsed = None
        _add(checks, "json_parse", "out/form_result.json is valid JSON", False, 1.0, str(exc))
    else:
        _add(
            checks,
            "json_parse",
            "out/form_result.json is valid JSON",
            isinstance(parsed, dict),
            1.0,
            f"top-level JSON value is {type(parsed).__name__}, expected an object",
        )
    # Valid JSON that is not an object (a list, string, number, ...) has no
    # .get(); score it as an empty result so the field checks fail cleanly
    # instead of raising AttributeError.
    result: dict[str, Any] = parsed if isinstance(parsed, dict) else {}

    _add(checks, "query", "query parameters are exact", result.get("query") == truth["query"], 2.0, f"got {result.get('query')!r}")
    _add(checks, "marker", "lookup marker is exact", result.get("marker") == truth["marker"], 2.0, f"got {result.get('marker')!r}")
    _add(checks, "confirm_marker", "confirmation marker is exact", result.get("confirm_marker") == truth["confirm_marker"], 2.0, f"got {result.get('confirm_marker')!r}")
    _add(checks, "result", "extracted result is exact", result.get("result") == truth["result"], 5.0, f"got {result.get('result')!r}")

    # Search the serialized output for tell-tale strings of the index page.
    dumped = json.dumps(result, ensure_ascii=False)
    no_html = "<html" not in dumped.lower() and "FORM_PORTAL_READY" not in dumped and "Order Lookup" not in dumped
    _add(checks, "no_unrelated_page_text", "output excludes index HTML and unrelated page text", no_html, 1.0)

    # The access log holds one request path per line.
    log_path = workspace / "out" / "form_access.log"
    seen = log_path.read_text(encoding="utf-8", errors="replace").splitlines() if log_path.is_file() else []
    for path in truth["required_paths"]:
        _add(checks, "visited_" + (path.strip("/") or "index"), f"server log includes GET {path}", path in seen, 1.0, f"seen {seen!r}")

    # Trace terms are matched case-insensitively; a missing trace file is an empty trace.
    trace_path = workspace / "out" / "interaction_trace.md"
    trace = trace_path.read_text(encoding="utf-8", errors="replace").lower() if trace_path.is_file() else ""
    _add(checks, "interaction_trace", "interaction_trace.md records token, hidden field, submit, and confirmation", all(term.lower() in trace for term in truth["trace_terms"]), 2.0)

    score = sum(c["weight"] for c in checks if c["pass"]) / sum(c["weight"] for c in checks)
    return {"task": "023-web-form-extraction", "workspace": str(workspace), "checks": checks, "outcome_score": score}