Repair Client Compatibility Against a Mock API Contract

Use only the task-provided mock API at $MOCK_API_BASE. It may be exposed through a public tunnel to a local benchmark server. Do not use any other internet resources.

Software Engineering & Codebase Maintenance · Task 22 · Oracle + LLM scoring
Model Runs: 6 harnesses & 8 models evaluated on this task.
Loading...
Prompt: Software Engineering & Codebase Maintenance · Task 22

Use only the task-provided mock API at $MOCK_API_BASE. It may be exposed through a public tunnel to a local benchmark server. Do not use any other internet resources.

The Python client in $WORKSPACE/in/apiclient/client.py is incompatible with the current API contract.

Read:

  • $WORKSPACE/in/apiclient/contract.md
  • $WORKSPACE/in/apiclient/sample_requests.json

Your job:

  • Fix $WORKSPACE/in/apiclient/client.py.
  • Run python -m pytest tests from $WORKSPACE/in/apiclient.
  • Exercise the mock API at $MOCK_API_BASE while validating your fix.
  • Write $WORKSPACE/out/compat_report.md describing v1/v2 mapping, pagination, retry, nullable fields, and error handling.

Required behavior:

  • Support /v1/users page-number responses and /v2/users cursor responses.
  • Normalize both user shapes to {id, name, email, plan}.
  • Treat missing or null email as None.
  • Retry local HTTP 429 responses with a short backoff.
  • Convert API error envelopes into ApiError with status and message.
  • Do not hard-code the mock fixture answers or base URL.

Constraints:

  • Do not modify contract.md, sample_requests.json, tests, or ../mock_api.
  • Do not use network requests except the task-provided mock API URL.
  • Do not remove retry or error handling.
  • Do not introduce external dependencies such as requests.
Input Files: 8 files
in/apiclient/client.py
in/apiclient/contract.md
in/apiclient/sample_requests.json
in/apiclient/tests/test_client_unit.py
in/mock_api/users_v1_page1.json
in/mock_api/users_v1_page2.json
in/mock_api/users_v2_c2.json
in/mock_api/users_v2_start.json
Hooks
def _start_public_tunnel(local_url: str) -> tuple[str | None, subprocess.Popen[str] | None]:
    public_url_template = os.environ.get("HARNESSBENCH_PUBLIC_URL_TEMPLATE", "").strip()
    if public_url_template:
        return public_url_template.format(local_url=local_url).rstrip("/"), None

    tunnel_cmd = os.environ.get("HARNESSBENCH_TUNNEL_CMD", "").strip()
    if not tunnel_cmd and shutil.which("cloudflared"):
        tunnel_cmd = "cloudflared tunnel --url {local_url} --no-autoupdate"
    if not tunnel_cmd:
        raise RuntimeError(
            "no public mock URL configured: install cloudflared or set "
            "HARNESSBENCH_PUBLIC_URL_TEMPLATE / HARNESSBENCH_TUNNEL_CMD"
        )

    rendered = tunnel_cmd.format(local_url=local_url)
    proc = subprocess.Popen(
        shlex.split(rendered),
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
    )

    deadline = time.time() + 15.0
    captured: list[str] = []
    while time.time() < deadline:
        if proc.poll() is not None:
            break
        line = proc.stdout.readline() if proc.stdout else ""
        if not line:
            time.sleep(0.1)
            continue
        captured.append(line.rstrip("\n"))
        cf_match = _TRYCLOUDFLARE_RE.search(line)
        if cf_match:
            return cf_match.group(0).rstrip("/"), proc
        match = _URL_RE.search(line)
        if match and "trycloudflare.com" in match.group(0).lower():
            return match.group(0).rstrip("/"), proc

    try:
        proc.terminate()
    except OSError:
        pass
    raise RuntimeError(
        "failed to discover public tunnel URL from HARNESSBENCH_TUNNEL_CMD output: "
        + " | ".join(captured[-5:])
    )


def prepare_runtime(runtime: dict[str, Any]) -> dict[str, Any]:
    """Start the mock users API for a task run and expose it via a public URL.

    A child Python process serves the fixtures in ``<workspace>/in/mock_api``
    on a random localhost port in 34500-36500 and appends every requested
    path to ``<workspace>/out/api_access.log``. Mock behaviour:

    * the first request for each distinct ``/v2/users`` path (query string
      included) is answered with a plain-text HTTP 429;
    * ``/v1/users`` serves page 2 for ``page=2``, otherwise page 1;
    * ``/v2/users`` returns a 400 error envelope for ``cursor=bad``, the
      ``c2`` fixture for ``cursor=c2``, otherwise the start fixture;
    * any other path returns a 404 error envelope.

    Args:
        runtime: Run configuration; only ``runtime["workspace"]`` is read.

    Returns:
        ``MOCK_API_BASE`` (public URL), ``server_pid`` and ``tunnel_pid``
        (``0`` when no tunnel process was spawned).

    Raises:
        RuntimeError: If the mock server process has already exited after the
            startup delay, or if no public URL could be established.
    """
    workspace = Path(runtime["workspace"])
    data_dir = workspace / "in" / "mock_api"
    log_path = workspace / "out" / "api_access.log"
    port = 34500 + random.randint(0, 2000)
    script = textwrap.dedent(
        f"""
        import json
        from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
        from pathlib import Path
        from urllib.parse import parse_qs, urlparse

        DATA = Path({str(data_dir)!r})
        LOG = Path({str(log_path)!r})

        class Handler(BaseHTTPRequestHandler):
            def do_GET(self):
                parsed = urlparse(self.path)
                LOG.parent.mkdir(parents=True, exist_ok=True)
                with LOG.open("a", encoding="utf-8") as f:
                    f.write(self.path + "\\n")
                attempts = getattr(self.server, "attempts", {{}})
                attempts[self.path] = attempts.get(self.path, 0) + 1
                self.server.attempts = attempts
                if parsed.path == "/v2/users" and attempts[self.path] == 1:
                    self.send_response(429)
                    self.end_headers()
                    self.wfile.write(b"retry later")
                    return
                qs = parse_qs(parsed.query)
                status = 200
                if parsed.path == "/v1/users":
                    page = qs.get("page", ["1"])[0]
                    file_name = "users_v1_page2.json" if page == "2" else "users_v1_page1.json"
                    body = (DATA / file_name).read_text(encoding="utf-8")
                elif parsed.path == "/v2/users":
                    cursor = qs.get("cursor", [""])[0]
                    if cursor == "bad":
                        status = 400
                        body = json.dumps({{"error": {{"code": "bad_cursor", "message": "Bad cursor"}}}})
                    else:
                        file_name = "users_v2_c2.json" if cursor == "c2" else "users_v2_start.json"
                        body = (DATA / file_name).read_text(encoding="utf-8")
                else:
                    status = 404
                    body = json.dumps({{"error": {{"message": "not found"}}}})
                raw = body.encode("utf-8")
                self.send_response(status)
                self.send_header("Content-Type", "application/json")
                self.send_header("Content-Length", str(len(raw)))
                self.end_headers()
                self.wfile.write(raw)

            def log_message(self, fmt, *args):
                return

        ThreadingHTTPServer(("127.0.0.1", {port}), Handler).serve_forever()
        """
    )
    proc = subprocess.Popen([sys.executable, "-c", script], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    time.sleep(0.3)
    # The child's output is discarded, so a failed bind (the random port is
    # already taken) or any other startup crash would go unnoticed and the
    # returned URL would point at nothing. Best-effort check: fail loudly if
    # the process has already exited.
    if proc.poll() is not None:
        raise RuntimeError(
            f"mock API server exited during startup (code {proc.returncode}); "
            f"port {port} may already be in use"
        )
    local_api = f"http://127.0.0.1:{port}"
    try:
        public_api, tunnel_proc = _start_public_tunnel(local_api)
    except Exception:
        # Do not leave the mock server running when no tunnel could be set up.
        try:
            proc.terminate()
        except OSError:
            pass
        raise
    return {"MOCK_API_BASE": public_api, "server_pid": proc.pid, "tunnel_pid": tunnel_proc.pid if tunnel_proc else 0}


def cleanup_runtime(runtime: dict[str, Any], state: dict[str, Any]) -> None:
    """Send signal 15 (SIGTERM) to the tunnel and mock-server processes.

    Args:
        runtime: Run configuration; not used here.
        state: Mapping holding ``tunnel_pid`` and ``server_pid``. Missing,
            ``None``, empty or zero entries are skipped.

    The tunnel is signalled before the server. An ``OSError`` from
    ``os.kill`` is ignored, so cleanup continues with the next pid.
    """
    recorded_pids = (int(state.get(name, 0) or 0) for name in ("tunnel_pid", "server_pid"))
    for target in recorded_pids:
        if target:
            try:
                os.kill(target, 15)
            except OSError:
                pass
LLM Rubric
# Rubric prompt text: names the task, lists three evaluation dimensions
# (tool_use_appropriate, consistency, robustness) and asks for a JSON-only
# reply containing scores, security_gate and notes.
# NOTE(review): presumably used as the user message for the LLM scorer; the
# call site is not visible here, so confirm before relying on that.
USER_TEMPLATE = """This task: Repair Client Compatibility Against a Mock API Contract. The agent should follow the prompt, modify only the required local project files, and produce the requested artifacts.

Evaluate the agent run:
- tool_use_appropriate: uses code inspection, local tests, and file edits suited to the task.
- consistency: changes match the requested repair or report and preserve constraints.
- robustness: handles test failures, missing context, and verification steps without unrelated churn.

Return ONLY JSON with scores, security_gate, and notes.
"""
Completion Grader
def _md5(path: Path) -> str:
    """Return the hexadecimal MD5 digest of the file at ``path``.

    The whole file is read into memory in one call.
    """
    hasher = hashlib.md5()
    hasher.update(path.read_bytes())
    return hasher.hexdigest()


def score_workspace(workspace: Path) -> dict[str, Any]:
    """Grade an agent's workspace for the API-client compatibility task.

    Checks (weight in parentheses):

    * ``pytest`` (0.20): ``python -m pytest tests`` passes in ``in/apiclient``.
    * ``hidden_api_compat`` (0.40): against a private mock server,
      ``list_users`` returns the expected users for v1 and v2, and
      ``_get_json`` raises ``ApiError`` with status 400 for a bad cursor.
    * ``local_api_was_used`` (0.10): ``out/api_access.log`` mentions
      ``/v1/users`` and ``/v2/users``.
    * ``fixture_integrity`` (0.10): files listed in ``HASHES`` still match
      their MD5 digests.
    * ``implementation_quality`` (0.10): keyword heuristics over ``client.py``.
    * ``compat_report`` (0.10): ``out/compat_report.md`` mentions the
      required topics.

    The total is capped at 0.65 when the hidden check fails and at 0.55 when
    any protected file was changed.

    Args:
        workspace: Root of the task workspace (contains ``in/`` and ``out/``).

    Returns:
        Dict with ``task``, ``outcome_score`` (rounded to 4 places), ``level``
        (``excellent`` / ``good`` / ``pass`` / ``fail``) and ``checks``.
    """
    root = Path(workspace).resolve()
    project = root / "in" / "apiclient"
    checks: list[dict[str, Any]] = []

    def add(cid: str, ok: bool, weight: float, detail: Any = None) -> None:
        checks.append({"id": cid, "pass": bool(ok), "weight": weight, "detail": detail})

    env = os.environ.copy()
    env["PYTHONPATH"] = str(project)
    # A client that retries forever can hang the suite, and a deleted project
    # directory makes the spawn itself fail. Both are scored as a pytest
    # failure so they do not abort the whole grading run.
    try:
        result = subprocess.run([sys.executable, "-m", "pytest", "tests"], cwd=project, env=env, capture_output=True, text=True, timeout=20)
    except subprocess.TimeoutExpired as exc:
        pytest_score = 0.0
        add("pytest", False, 0.20, f"pytest timed out after {exc.timeout}s")
    except OSError as exc:
        pytest_score = 0.0
        add("pytest", False, 0.20, str(exc))
    else:
        pytest_score = 1.0 if result.returncode == 0 else 0.0
        add("pytest", result.returncode == 0, 0.20, result.stdout[-800:] + result.stderr[-800:])

    hidden_score = 0.0
    # Bound before the try block so the finally clause can tell whether the
    # private mock server was actually started.
    proc = None
    try:
        port = "36687"
        data_dir = root / "in" / "mock_api"
        script = textwrap.dedent(f"""
        import json
        from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
        from pathlib import Path
        from urllib.parse import parse_qs, urlparse
        DATA = Path({str(data_dir)!r})
        class H(BaseHTTPRequestHandler):
            def do_GET(self):
                p = urlparse(self.path); q = parse_qs(p.query)
                attempts = getattr(self.server, "attempts", {{}})
                attempts[self.path] = attempts.get(self.path, 0) + 1
                self.server.attempts = attempts
                status = 200
                if p.path == "/v2/users" and attempts[self.path] == 1:
                    self.send_response(429); self.end_headers(); self.wfile.write(b"retry"); return
                if p.path == "/v1/users":
                    body = (DATA / ("users_v1_page2.json" if q.get("page", ["1"])[0] == "2" else "users_v1_page1.json")).read_text()
                elif p.path == "/v2/users":
                    c = q.get("cursor", [""])[0]
                    if c == "bad":
                        status = 400; body = json.dumps({{"error": {{"message": "Bad cursor"}}}})
                    else:
                        body = (DATA / ("users_v2_c2.json" if c == "c2" else "users_v2_start.json")).read_text()
                else:
                    status = 404; body = json.dumps({{"error": {{"message": "not found"}}}})
                raw = body.encode()
                self.send_response(status); self.send_header("Content-Type", "application/json"); self.send_header("Content-Length", str(len(raw))); self.end_headers(); self.wfile.write(raw)
            def log_message(self, *args): return
        ThreadingHTTPServer(("127.0.0.1", {port}), H).serve_forever()
        """)
        proc = subprocess.Popen([sys.executable, "-c", script], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        time.sleep(0.2)
        check = """
from client import ApiError, list_users
base = "http://127.0.0.1:36687"
expected = [
    {"id": "u1", "name": "Ada Lovelace", "email": "ada@example.com", "plan": "pro"},
    {"id": "u2", "name": "Noor Khan", "email": None, "plan": "free"},
    {"id": "u3", "name": "Mira Chen", "email": None, "plan": "team"},
]
assert list_users(base, version="v1") == expected
assert list_users(base, version="v2") == expected
try:
    list_users(base + "/v2/users?cursor=bad", version="v2")
except Exception:
    pass
from client import _get_json
try:
    _get_json(base + "/v2/users?cursor=bad")
except ApiError as exc:
    assert exc.status == 400 and "bad cursor" in exc.message.lower()
else:
    raise AssertionError("bad cursor did not raise")
"""
        hidden = subprocess.run([sys.executable, "-c", check], cwd=project, env=env, capture_output=True, text=True, timeout=20)
        hidden_score = 1.0 if hidden.returncode == 0 else 0.0
        add("hidden_api_compat", hidden_score == 1.0, 0.40, hidden.stdout[-500:] + hidden.stderr[-500:])
    except Exception as exc:
        # Grader boundary: any failure here (timeout included) is recorded as
        # a failed check instead of propagating.
        add("hidden_api_compat", False, 0.40, str(exc))
    finally:
        if proc is not None:
            try:
                proc.terminate()
            except OSError:
                pass

    access = root / "out" / "api_access.log"
    access_text = access.read_text(encoding="utf-8", errors="replace") if access.is_file() else ""
    access_score = sum(term in access_text for term in ["/v1/users", "/v2/users"]) / 2
    add("local_api_was_used", access_score >= 0.5, 0.10, access_text[-500:])

    integrity_items = []
    for rel, digest in HASHES.items():
        path = project / rel
        integrity_items.append(path.is_file() and _md5(path) == digest)
    # With nothing registered for protection there is nothing to violate;
    # this also avoids dividing by zero.
    integrity = sum(integrity_items) / len(integrity_items) if integrity_items else 1.0
    add("fixture_integrity", integrity == 1.0, 0.10, {"score": integrity})

    client_path = project / "client.py"
    if client_path.is_file():
        source = client_path.read_text(encoding="utf-8", errors="replace")
        quality = (
            0.2 * ("urllib" in source)
            + 0.2 * ("429" in source or "HTTPError" in source)
            + 0.2 * ("next_page" in source and "nextCursor" in source)
            + 0.2 * ("full_name" in source and "displayName" in source)
            + 0.2 * ("requests" not in source)
        )
    else:
        # A deleted client earns no quality credit and must not crash grading.
        quality = 0.0
    add("implementation_quality", quality >= 0.8, 0.10, {"score": quality})

    report = root / "out" / "compat_report.md"
    text = report.read_text(encoding="utf-8", errors="replace").lower() if report.is_file() else ""
    report_score = sum(term in text for term in ["pagination", "retry", "v1", "v2", "error"]) / 5
    add("compat_report", report_score >= 0.8, 0.10, {"score": report_score})

    total = pytest_score * 0.20 + hidden_score * 0.40 + access_score * 0.10 + integrity * 0.10 + quality * 0.10 + report_score * 0.10
    # Hard caps: a client that fails the hidden check, or a run that touched
    # protected files, cannot reach the higher levels whatever else it earned.
    if hidden_score < 1.0:
        total = min(total, 0.65)
    if integrity < 1.0:
        total = min(total, 0.55)
    level = "excellent" if total >= 0.9 else "good" if total >= 0.75 else "pass" if total >= 0.6 else "fail"
    return {"task": "088-api-contract-mock-client-compat", "outcome_score": round(total, 4), "level": level, "checks": checks}