Local REST API summary

Use the local REST API provided in the MOCK_API_BASE environment variable. Do not use the public internet.

Workspace, Tool Use & Multimodal Operations · Task 9 · Oracle + LLM scoring

Task ID: 022-local-rest-api-summary

Difficulty: Hard

Tags

Model Runs: 6 harnesses & 8 models evaluated on this task.

Prompt: Workspace, Tool Use & Multimodal Operations · Task 9

Use the local REST API provided in the MOCK_API_BASE environment variable. Do not use the public internet.

Available endpoints:

GET $MOCK_API_BASE/projects?page=N
GET $MOCK_API_BASE/users?page=N
GET $MOCK_API_BASE/incidents?page=N

Each endpoint is paginated and returns JSON with items and next_page. Some first requests may return HTTP 429 or 503; retry locally with a short backoff.

Task:
Fetch all pages from all three endpoints, join the data, and write:

$WORKSPACE/out/api_summary.json
$WORKSPACE/out/data_quality_report.md

Required JSON format:
{
"source_base_url": "the exact base URL you used",
"project_count": 4,
"active_budget_usd": 26800,
"open_incidents": 5,
"projects": [
{
"id": "p-100",
"name": "Atlas Migration",
"owner_name": "Mira Chen",
"team": "Platform",
"status": "active",
"budget_usd": 12500,
"open_incident_count": 1,
"highest_open_severity": "high"
}
]
}

Rules:

Sort projects by id ascending.
active_budget_usd is the sum of budget_usd for projects whose status is "active".
open_incidents counts only incidents where open is true.
highest_open_severity ranks high > medium > low; use null if a project has no open incidents.
Save only the summary JSON. Do not include raw endpoint payloads or unrelated text.
Do not count duplicate project ids twice.
Use owner_name null and team null for missing owners, and mention them in data_quality_report.md.
Exclude orphan incidents from project counts but mention them in data_quality_report.md.

Input Files: 3 files

in/api_data/incidents.json

in/api_data/projects.json

in/api_data/users.json

Hooks

# Source of the mock API server.  It is run in a child interpreter as
# ``python -c <script> <data_dir> <log_path> <port>``.  The script is a raw
# string on purpose: escapes such as "\n" must reach the child verbatim; if
# they were expanded here the string literal inside the script would be split
# across two lines and the child would die with a SyntaxError.
_MOCK_SERVER_SCRIPT = r"""
import json
import sys
import threading
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
from pathlib import Path
from urllib.parse import parse_qs, urlparse

DATA_DIR = Path(sys.argv[1])
LOG_PATH = Path(sys.argv[2])
PORT = int(sys.argv[3])
PAGE_SIZE = 2
FILES = {
    "/projects": "projects.json",
    "/users": "users.json",
    "/incidents": "incidents.json",
}
# The first request for each distinct URL on these paths fails with this status.
TRANSIENT = {"/projects": 429, "/incidents": 503}
ATTEMPTS = {}
LOCK = threading.Lock()


class Handler(BaseHTTPRequestHandler):
    def _plain(self, status, text):
        self.send_response(status)
        self.end_headers()
        self.wfile.write(text)

    def do_GET(self):
        url = urlparse(self.path)
        path = url.path
        # Requests are served on separate threads; serialize the log append.
        with LOCK:
            LOG_PATH.parent.mkdir(parents=True, exist_ok=True)
            with LOG_PATH.open("a", encoding="utf-8") as f:
                f.write(path + "\n")
        if path not in FILES:
            self._plain(404, b"not found")
            return
        # Attempts are keyed by the full request target (path + query).
        with LOCK:
            ATTEMPTS[self.path] = ATTEMPTS.get(self.path, 0) + 1
            first_attempt = ATTEMPTS[self.path] == 1
        if first_attempt and path in TRANSIENT:
            self._plain(TRANSIENT[path], b"retry later")
            return
        try:
            page = int(parse_qs(url.query).get("page", ["1"])[0])
        except ValueError:
            page = 0
        if page < 1:
            self._plain(400, b"invalid page")
            return
        all_items = json.loads((DATA_DIR / FILES[path]).read_text(encoding="utf-8"))
        start = (page - 1) * PAGE_SIZE
        items = all_items[start:start + PAGE_SIZE]
        next_page = page + 1 if start + PAGE_SIZE < len(all_items) else None
        body = json.dumps({"items": items, "next_page": next_page}, sort_keys=True).encode("utf-8")
        self.send_response(200)
        self.send_header("Content-Type", "application/json")
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def log_message(self, fmt, *args):
        return


ThreadingHTTPServer(("127.0.0.1", PORT), Handler).serve_forever()
"""

# How many random ports to try, and how long to wait for each server to come up.
_MOCK_SERVER_START_ATTEMPTS = 5
_MOCK_SERVER_START_TIMEOUT_S = 5.0


def _port_is_free(port: int) -> bool:
    """Return True if 127.0.0.1:port can currently be bound."""
    import socket

    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
        try:
            probe.bind(("127.0.0.1", port))
        except OSError:
            return False
    return True


def _wait_until_listening(proc: "subprocess.Popen[bytes]", port: int, timeout: float) -> bool:
    """Poll until the child accepts TCP connections; False if it exits or times out."""
    import socket

    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if proc.poll() is not None:
            # The child died (bad port, missing interpreter feature, ...).
            return False
        try:
            # Connecting and closing without sending a request does not reach
            # do_GET, so it leaves the access log and retry counters untouched.
            with socket.create_connection(("127.0.0.1", port), timeout=0.2):
                pass
        except OSError:
            time.sleep(0.02)
            continue
        return proc.poll() is None
    return False


def prepare_runtime(runtime: dict[str, Any]) -> dict[str, Any]:
    """Start the local mock REST API for this task and return its coordinates.

    The server serves ``projects.json``, ``users.json`` and ``incidents.json``
    from ``<workspace>/in/api_data`` two items per page, appends the path of
    every GET to ``<workspace>/out/api_access.log``, and answers the first
    request for each distinct ``/projects`` or ``/incidents`` URL with 429 or
    503 respectively.

    Args:
        runtime: Harness runtime mapping; only ``runtime["workspace"]`` is read.

    Returns:
        ``{"MOCK_API_BASE": "http://127.0.0.1:<port>", "server_pid": <pid>}``.

    Raises:
        RuntimeError: If no server could be brought up after several ports.
    """
    workspace = Path(runtime["workspace"])
    data_dir = workspace / "in" / "api_data"
    log_path = workspace / "out" / "api_access.log"
    log_path.parent.mkdir(parents=True, exist_ok=True)

    for _ in range(_MOCK_SERVER_START_ATTEMPTS):
        port = 32000 + random.randint(0, 2000)
        # Skip ports that are already taken so that the readiness probe below
        # cannot be fooled by somebody else's listener.
        if not _port_is_free(port):
            continue
        proc = subprocess.Popen(
            [sys.executable, "-c", _MOCK_SERVER_SCRIPT, str(data_dir), str(log_path), str(port)],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
        if _wait_until_listening(proc, port, _MOCK_SERVER_START_TIMEOUT_S):
            return {"MOCK_API_BASE": f"http://127.0.0.1:{port}", "server_pid": proc.pid}
        # Do not leave a half-started child behind before trying another port.
        proc.kill()
        proc.wait()
    raise RuntimeError("mock API server failed to start on 127.0.0.1")

def cleanup_runtime(runtime: dict[str, Any], state: dict[str, Any]) -> None:
    """Ask the mock API server recorded in ``state`` to terminate.

    Args:
        runtime: Harness runtime mapping; not used here.
        state: Mapping returned by ``prepare_runtime``; ``state["server_pid"]``
            is read. A missing, ``None`` or zero pid means nothing to clean up.

    The call is best effort: a server that has already exited is not an error.
    """
    import signal

    pid = int(state.get("server_pid", 0) or 0)
    # os.kill gives pid <= 0 a broadcast meaning (process group / every
    # process), so only a strictly positive pid may ever be signalled.
    if pid <= 0:
        return
    try:
        os.kill(pid, signal.SIGTERM)
    except OSError:
        # The process is already gone (or not ours to signal); nothing to do.
        pass

LLM Rubric

# Prompt text for the LLM rubric: it restates the task, names the two key
# output files, lists the three criteria to evaluate (tool_use_appropriate,
# consistency, robustness) and asks for a JSON-only reply containing scores,
# security_gate and notes.
# NOTE(review): the task restatement on the first line ends in "..." and looks
# truncated here; confirm against the full task prompt.
USER_TEMPLATE = """This task: Use the local REST API provided in the MOCK_API_BASE environment variable. Do not use the public internet. Available endpoints: - GET $MOCK_API_BASE/projects?page=N - GET $MOCK_API_BASE/users?page=N - GET $MOCK_API_BASE/...

Key outputs: out/api_summary.json, out/data_quality_report.md.

Evaluate the agent run:
- tool_use_appropriate: uses tools that match the task modality and files/APIs.
- consistency: follows the prompt requirements and writes the expected artifacts.
- robustness: handles missing directories, malformed inputs, retries, or local-server issues when relevant.

Return ONLY JSON with scores, security_gate, and notes.
"""

Completion Grader

def _add(checks: list[dict[str, Any]], cid: str, label: str, ok: bool, weight: float, detail: str | None = None) -> None:
    checks.append({"id": cid, "label": label, "pass": ok, "weight": weight, "detail": None if ok else detail})

def score_workspace(workspace: Path) -> dict[str, Any]:
    """Grade the artifacts an agent left in ``workspace/out`` for this task.

    The expected values come from ``ground_truth.json`` next to this file.
    Checks cover the summary JSON (parseable object, local source URL, the
    three totals, exact project rows, id ordering), the server access log
    (every required endpoint visited, retries observed) and the data-quality
    report (all required terms present, case-insensitively).

    Args:
        workspace: Root of the agent workspace.

    Returns:
        A dict with the task id, the workspace path, the list of check
        records and ``outcome_score``, the weight-normalized share of
        passing checks.
    """
    truth = json.loads((Path(__file__).resolve().parent / "ground_truth.json").read_text(encoding="utf-8"))
    checks: list[dict[str, Any]] = []
    out_dir = workspace / "out"

    try:
        summary = json.loads((out_dir / "api_summary.json").read_text(encoding="utf-8"))
    except Exception as exc:
        # Agent output is untrusted; any read/parse failure is a failed check,
        # never a grader crash.
        summary = {}
        _add(checks, "json_parse", "out/api_summary.json is valid JSON", False, 1.0, str(exc))
    else:
        is_object = isinstance(summary, dict)
        _add(checks, "json_parse", "out/api_summary.json is valid JSON", is_object, 1.0, f"expected a JSON object, got {type(summary).__name__}")
        if not is_object:
            # Valid JSON that is not an object (list, string, number, ...) has
            # no keys to grade; treat it as empty so the checks below fail
            # cleanly instead of raising on .get().
            summary = {}

    source_url = summary.get("source_base_url")
    _add(checks, "source_url", "source_base_url records local API", isinstance(source_url, str) and source_url.startswith("http://127.0.0.1:"), 1.0, repr(source_url))
    for key in ("project_count", "active_budget_usd", "open_incidents"):
        _add(checks, key, f"{key} is correct", summary.get(key) == truth[key], 2.0, f"got {summary.get(key)!r}")

    projects = summary.get("projects")
    _add(checks, "projects", "joined project rows are exact", projects == truth["projects"], 6.0, f"got {projects!r}")
    sorted_ok = False
    if isinstance(projects, list) and all(isinstance(p, dict) for p in projects):
        ids = [p.get("id") for p in projects]
        try:
            sorted_ok = ids == sorted(ids)
        except TypeError:
            # Missing or mixed-type ids cannot be ordered, so they are not sorted.
            sorted_ok = False
    _add(checks, "project_sort", "projects sorted by id", sorted_ok, 1.0)

    log_path = out_dir / "api_access.log"
    seen = log_path.read_text(encoding="utf-8", errors="replace").splitlines() if log_path.is_file() else []
    for endpoint in truth["required_endpoints"]:
        _add(checks, "visited_" + endpoint.strip("/"), f"server log includes GET {endpoint}", endpoint in seen, 1.0, f"seen {seen!r}")
    # Two or more hits on an endpoint whose first request fails imply a retry.
    retry_ok = seen.count("/projects") >= 2 and seen.count("/incidents") >= 2
    _add(checks, "retry_observed", "server log shows retry after transient failures", retry_ok, 2.0, f"seen {seen!r}")

    report_path = out_dir / "data_quality_report.md"
    quality = report_path.read_text(encoding="utf-8", errors="replace").lower() if report_path.is_file() else ""
    _add(checks, "quality_report", "data_quality_report.md covers duplicates, missing owners, orphan incidents, and retries", all(term.lower() in quality for term in truth["quality_terms"]), 3.0)

    score = sum(c["weight"] for c in checks if c["pass"]) / sum(c["weight"] for c in checks)
    return {"task": "022-local-rest-api-summary", "workspace": str(workspace), "checks": checks, "outcome_score": score}

← Task 8 Task 10 →