Model Runs: 6 harnesses & 8 models evaluated on this task.
Loading...
Prompt — Workspace, Tool Use & Multimodal Operations · Task 10
Use the local web page provided in the MOCK_FORM_URL environment variable. Do not use the public internet.
Task:
- Open the local page.
- Extract the CSRF token and hidden fields from the form.
- Submit the lookup form with:
  - order_id: A-1042
  - region: emea
- Follow the confirmation step if returned.
- Save the extracted result to $WORKSPACE/out/form_result.json and write $WORKSPACE/out/interaction_trace.md.
Required JSON format:
{
"query": {"order_id": "A-1042", "region": "emea"},
"marker": "FORM_LOOKUP_OK",
"result": {
"order_id": "A-1042",
"region": "emea",
"customer": "Nadia Rossi",
"status": "ready_for_invoice",
"total_usd": 1842.75,
"line_count": 3
}
}
Rules:
- The marker must come from the lookup result page or response.
- Include confirm_marker: "FORM_CONFIRM_OK" from the confirmation response.
- Do not include unrelated page text, HTML, or the full index page in $WORKSPACE/out/form_result.json.
- Numeric values must be numbers, not strings.
- Do not modify $WORKSPACE/in/www/index.html.
- interaction_trace.md must mention the homepage visit, CSRF token, hidden field, lookup submission, confirmation step, and final marker.
Input Files: 1 file
in/www/index.html
Hooks
def prepare_runtime(runtime: dict[str, Any]) -> dict[str, Any]:
    """Start the local mock order-lookup server for one task run.

    The server is a separate Python process serving
    ``<workspace>/in/www/index.html`` at ``/``, a form endpoint at
    ``POST /lookup`` and a confirmation endpoint at ``GET /confirm``.  Every
    request path it handles is appended to ``<workspace>/out/form_access.log``.

    Args:
        runtime: Must contain ``"workspace"``, the path of the task workspace.

    Returns:
        A dict with ``"MOCK_FORM_URL"`` (base URL of the server, with a
        trailing slash) and ``"server_pid"`` (pid of the server process).

    Raises:
        RuntimeError: If no server could be brought up after several attempts.
    """
    workspace = Path(runtime["workspace"])
    www = workspace / "in" / "www"
    log_path = workspace / "out" / "form_access.log"
    log_path.parent.mkdir(parents=True, exist_ok=True)

    # The port is random, so it may already be taken.  Probe before launching
    # and retry on another port rather than handing back a dead URL.
    for _ in range(20):
        port = 34000 + random.randint(0, 2000)
        if not _port_is_free(port):
            continue
        script = _mock_server_script(www, log_path, port)
        proc = subprocess.Popen(
            [sys.executable, "-c", script],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
        if _wait_for_server(proc, port):
            return {"MOCK_FORM_URL": f"http://127.0.0.1:{port}/", "server_pid": proc.pid}
        # The child died or never started listening: reap it and try again.
        if proc.poll() is None:
            proc.kill()
        proc.wait()
    raise RuntimeError("mock form server failed to start on 127.0.0.1 (ports 34000-36000)")


def _mock_server_script(www: Path, log_path: Path, port: int) -> str:
    """Return the source code of the standalone mock form server.

    The newline written to the access log is escaped in this template so that
    the generated script contains an escape sequence instead of a raw line
    break inside a string literal (which would be a syntax error in the child
    process, hidden because its stderr is discarded).
    """
    return textwrap.dedent(
        f"""
        import json
        from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
        from pathlib import Path
        from urllib.parse import parse_qs, urlparse

        WWW = Path({str(www)!r})
        LOG_PATH = Path({str(log_path)!r})
        ORDER = {{
            "order_id": "A-1042",
            "region": "emea",
            "customer": "Nadia Rossi",
            "status": "ready_for_invoice",
            "total_usd": 1842.75,
            "line_count": 3,
        }}


        class Handler(BaseHTTPRequestHandler):
            def do_GET(self):
                parsed = urlparse(self.path)
                with LOG_PATH.open("a", encoding="utf-8") as f:
                    f.write(parsed.path + "\\n")
                if parsed.path == "/":
                    body = (WWW / "index.html").read_bytes()
                    self.send_response(200)
                    self.send_header("Content-Type", "text/html; charset=utf-8")
                    self.send_header("Content-Length", str(len(body)))
                    self.end_headers()
                    self.wfile.write(body)
                    return
                if parsed.path == "/confirm":
                    qs = parse_qs(parsed.query)
                    ok = qs.get("confirm_id", [""])[0] == "confirm-A-1042"
                    payload = {{
                        "marker": "FORM_CONFIRM_OK" if ok else "FORM_CONFIRM_FAILED",
                        "result": ORDER if ok else None,
                    }}
                    body = json.dumps(payload, sort_keys=True).encode("utf-8")
                    self.send_response(200)
                    self.send_header("Content-Type", "application/json")
                    self.send_header("Content-Length", str(len(body)))
                    self.end_headers()
                    self.wfile.write(body)
                    return
                self.send_response(404)
                self.end_headers()

            def do_POST(self):
                parsed = urlparse(self.path)
                with LOG_PATH.open("a", encoding="utf-8") as f:
                    f.write(parsed.path + "\\n")
                length = int(self.headers.get("Content-Length", "0") or "0")
                form = parse_qs(self.rfile.read(length).decode("utf-8"))
                if parsed.path == "/lookup":
                    ok = (
                        form.get("csrf_token", [""])[0] == "local-csrf-742"
                        and form.get("request_source", [""])[0] == "invoice_portal"
                        and form.get("order_id", [""])[0] == "A-1042"
                        and form.get("region", [""])[0] == "emea"
                    )
                    payload = {{
                        "marker": "FORM_LOOKUP_OK" if ok else "FORM_LOOKUP_RETRY",
                        "confirm_url": "/confirm?confirm_id=confirm-A-1042" if ok else None,
                        "query": {{"order_id": form.get("order_id", [""])[0], "region": form.get("region", [""])[0]}},
                    }}
                    body = json.dumps(payload, sort_keys=True).encode("utf-8")
                    self.send_response(200 if ok else 400)
                    self.send_header("Content-Type", "application/json")
                    self.send_header("Content-Length", str(len(body)))
                    self.end_headers()
                    self.wfile.write(body)
                    return
                self.send_response(404)
                self.end_headers()

            def log_message(self, fmt, *args):
                return


        ThreadingHTTPServer(("127.0.0.1", {port}), Handler).serve_forever()
        """
    )


def _port_is_free(port: int) -> bool:
    """Return True if 127.0.0.1:*port* can currently be bound."""
    import socket

    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
        try:
            probe.bind(("127.0.0.1", port))
        except OSError:
            return False
    return True


def _wait_for_server(proc: "subprocess.Popen[bytes]", port: int, timeout: float = 3.0) -> bool:
    """Poll until *proc* accepts TCP connections on *port*.

    Returns False if the process exits first or *timeout* seconds elapse.
    The probe connects and closes without sending a request, so the handler
    methods that write the access log are never reached.
    """
    import socket

    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if proc.poll() is not None:
            return False
        try:
            with socket.create_connection(("127.0.0.1", port), timeout=0.2):
                return True
        except OSError:
            time.sleep(0.02)
    return False
def cleanup_runtime(runtime: dict[str, Any], state: dict[str, Any]) -> None:
    """Stop the mock form server recorded in *state*.

    Args:
        runtime: Unused here; accepted to match the hook signature.
        state: Hook state; ``"server_pid"`` is the pid to terminate.  A
            missing, empty or zero value means there is nothing to stop.
    """
    import signal  # local import so the hook does not rely on file-level imports

    pid = int(state.get("server_pid", 0) or 0)
    # On POSIX, os.kill() with pid <= 0 targets a process group or every
    # process the caller may signal, so only strictly positive pids are used.
    if pid <= 0:
        return
    try:
        os.kill(pid, signal.SIGTERM)
    except OSError:
        # Best-effort cleanup: the server may already have exited.
        pass


# --- LLM Rubric ---
# Prompt text for the LLM rubric: it restates the task, lists the key output
# paths, and asks for scores on tool use, consistency and robustness as JSON.
# NOTE(review): the first line ends in "..." and looks truncated in this view;
# confirm against the full template before relying on its wording.
USER_TEMPLATE = """This task: Use the local web page provided in the MOCK_FORM_URL environment variable. Do not use the public internet. Task: 1. Open the local page. 2. Extract the CSRF token and hidden fields from the form. 3. Submit the lookup for...
Key outputs: in/www/index.html., out/form_result.json, out/form_result.json., out/interaction_trace.md..
Evaluate the agent run:
- tool_use_appropriate: uses tools that match the task modality and files/APIs.
- consistency: follows the prompt requirements and writes the expected artifacts.
- robustness: handles missing directories, malformed inputs, retries, or local-server issues when relevant.
Return ONLY JSON with scores, security_gate, and notes.
"""


# --- Completion Grader ---
def _sha256(path: Path) -> str:
    """Return the hexadecimal SHA-256 digest of the file at *path*.

    The file is read in 64 KiB pieces so large files are never held in
    memory all at once.
    """
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        while piece := stream.read(65536):
            digest.update(piece)
    return digest.hexdigest()
def _add(checks: list[dict[str, Any]], cid: str, label: str, ok: bool, weight: float, detail: str | None = None) -> None:
checks.append({"id": cid, "label": label, "pass": ok, "weight": weight, "detail": None if ok else detail})
def score_workspace(workspace: Path) -> dict[str, Any]:
    """Grade a finished run by inspecting the files in *workspace*.

    Expected values are read from ``ground_truth.json`` next to this module.
    The checks cover: the index page being unchanged, ``out/form_result.json``
    parsing as a JSON object with the expected query, markers and result, the
    output being free of index-page text, the server access log containing
    every required path, and ``out/interaction_trace.md`` mentioning every
    required term.

    Args:
        workspace: Root directory of the run (contains ``in/`` and ``out/``).

    Returns:
        A dict with the task id, the workspace path, the list of check
        records, and ``outcome_score``: passed weight divided by total weight.
    """
    truth = json.loads((Path(__file__).resolve().parent / "ground_truth.json").read_text(encoding="utf-8"))
    checks: list[dict[str, Any]] = []

    index = workspace / "in" / "www" / "index.html"
    actual_index_hash = _sha256(index) if index.is_file() else ""
    _add(checks, "source_index", "www/index.html unchanged", actual_index_hash == _INDEX_SHA256, 1.0, f"got {actual_index_hash!r}")

    try:
        parsed = json.loads((workspace / "out" / "form_result.json").read_text(encoding="utf-8"))
    except Exception as exc:
        # Missing file, bad encoding and malformed JSON are all reported as a
        # failed check instead of aborting the grader.
        parsed = {}
        _add(checks, "json_parse", "out/form_result.json is valid JSON", False, 1.0, str(exc))
    else:
        _add(
            checks,
            "json_parse",
            "out/form_result.json is valid JSON",
            isinstance(parsed, dict),
            1.0,
            f"expected a JSON object, got {type(parsed).__name__}",
        )
    # Valid JSON that is not an object (list, string, number, ...) has no
    # .get(); use an empty dict so the field checks below fail cleanly
    # instead of raising AttributeError.
    result = parsed if isinstance(parsed, dict) else {}

    _add(checks, "query", "query parameters are exact", result.get("query") == truth["query"], 2.0, f"got {result.get('query')!r}")
    _add(checks, "marker", "lookup marker is exact", result.get("marker") == truth["marker"], 2.0, f"got {result.get('marker')!r}")
    _add(checks, "confirm_marker", "confirmation marker is exact", result.get("confirm_marker") == truth["confirm_marker"], 2.0, f"got {result.get('confirm_marker')!r}")
    _add(checks, "result", "extracted result is exact", result.get("result") == truth["result"], 5.0, f"got {result.get('result')!r}")

    # Scan the raw parsed value so leaked page text is caught even when the
    # top-level JSON value is not an object.
    dumped = json.dumps(parsed, ensure_ascii=False)
    no_html = "<html" not in dumped.lower() and "FORM_PORTAL_READY" not in dumped and "Order Lookup" not in dumped
    _add(checks, "no_unrelated_page_text", "output excludes index HTML and unrelated page text", no_html, 1.0)

    # The access log holds one request path per line.
    log_path = workspace / "out" / "form_access.log"
    seen = log_path.read_text(encoding="utf-8", errors="replace").splitlines() if log_path.is_file() else []
    for path in truth["required_paths"]:
        _add(checks, "visited_" + (path.strip("/") or "index"), f"server log includes GET {path}", path in seen, 1.0, f"seen {seen!r}")

    # Trace terms are matched case-insensitively against the whole file.
    trace_path = workspace / "out" / "interaction_trace.md"
    trace = trace_path.read_text(encoding="utf-8", errors="replace").lower() if trace_path.is_file() else ""
    _add(checks, "interaction_trace", "interaction_trace.md records token, hidden field, submit, and confirmation", all(term.lower() in trace for term in truth["trace_terms"]), 2.0)

    score = sum(c["weight"] for c in checks if c["pass"]) / sum(c["weight"] for c in checks)
    return {"task": "023-web-form-extraction", "workspace": str(workspace), "checks": checks, "outcome_score": score}