Model Runs: 6 harnesses & 8 models evaluated on this task.
Loading...
Prompt: Software Engineering & Codebase Maintenance · Task 22
Use only the task-provided mock API at $MOCK_API_BASE. It may be exposed through a public tunnel to a local benchmark server. Do not use any other internet resources.
The Python client in $WORKSPACE/in/apiclient/client.py is incompatible with the current API contract.
Read:
- `$WORKSPACE/in/apiclient/contract.md`
- `$WORKSPACE/in/apiclient/sample_requests.json`
Your job:
- Fix `$WORKSPACE/in/apiclient/client.py`.
- Run `python -m pytest tests` from `$WORKSPACE/in/apiclient`.
- Exercise the mock API at `$MOCK_API_BASE` while validating your fix.
- Write `$WORKSPACE/out/compat_report.md` describing v1/v2 mapping, pagination, retry, nullable fields, and error handling.
Required behavior:
- Support `/v1/users` page-number responses and `/v2/users` cursor responses.
- Normalize both user shapes to `{id, name, email, plan}`.
- Treat missing or null email as `None`.
- Retry local HTTP 429 responses with a short backoff.
- Convert API error envelopes into `ApiError` with status and message.
- Do not hard-code the mock fixture answers or base URL.
Constraints:
- Do not modify `contract.md`, `sample_requests.json`, `tests`, or `../mock_api`.
- Do not use network requests except the task-provided mock API URL.
- Do not remove retry or error handling.
- Do not introduce external dependencies such as requests.
Input Files: 8 files
in/apiclient/client.py
in/apiclient/contract.md
in/apiclient/sample_requests.json
in/apiclient/tests/test_client_unit.py
in/mock_api/users_v1_page1.json
in/mock_api/users_v1_page2.json
in/mock_api/users_v2_c2.json
in/mock_api/users_v2_start.json
Hooks
def _start_public_tunnel(local_url: str) -> tuple[str | None, subprocess.Popen[str] | None]:
public_url_template = os.environ.get("HARNESSBENCH_PUBLIC_URL_TEMPLATE", "").strip()
if public_url_template:
return public_url_template.format(local_url=local_url).rstrip("/"), None
tunnel_cmd = os.environ.get("HARNESSBENCH_TUNNEL_CMD", "").strip()
if not tunnel_cmd and shutil.which("cloudflared"):
tunnel_cmd = "cloudflared tunnel --url {local_url} --no-autoupdate"
if not tunnel_cmd:
raise RuntimeError(
"no public mock URL configured: install cloudflared or set "
"HARNESSBENCH_PUBLIC_URL_TEMPLATE / HARNESSBENCH_TUNNEL_CMD"
)
rendered = tunnel_cmd.format(local_url=local_url)
proc = subprocess.Popen(
shlex.split(rendered),
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
text=True,
)
deadline = time.time() + 15.0
captured: list[str] = []
while time.time() < deadline:
if proc.poll() is not None:
break
line = proc.stdout.readline() if proc.stdout else ""
if not line:
time.sleep(0.1)
continue
captured.append(line.rstrip("\n"))
cf_match = _TRYCLOUDFLARE_RE.search(line)
if cf_match:
return cf_match.group(0).rstrip("/"), proc
match = _URL_RE.search(line)
if match and "trycloudflare.com" in match.group(0).lower():
return match.group(0).rstrip("/"), proc
try:
proc.terminate()
except OSError:
pass
raise RuntimeError(
"failed to discover public tunnel URL from HARNESSBENCH_TUNNEL_CMD output: "
+ " | ".join(captured[-5:])
)
def _pick_mock_api_port(low: int = 34500, high: int = 36500, attempts: int = 50) -> int:
    """Return a random port in ``[low, high]`` that can currently be bound on 127.0.0.1."""
    import socket

    for _ in range(attempts):
        candidate = random.randint(low, high)
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
            try:
                probe.bind(("127.0.0.1", candidate))
            except OSError:
                continue  # already in use; try another candidate
        return candidate
    raise RuntimeError(f"no free port found in {low}-{high} for the mock API")


def _wait_for_mock_api(proc: subprocess.Popen, port: int, timeout: float = 5.0) -> None:
    """Block until the mock server accepts TCP connections, or raise ``RuntimeError``."""
    import socket

    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if proc.poll() is not None:
            raise RuntimeError(f"mock API server exited early with code {proc.returncode}")
        try:
            socket.create_connection(("127.0.0.1", port), timeout=0.2).close()
        except OSError:
            time.sleep(0.05)
        else:
            return
    raise RuntimeError(f"mock API server did not start listening on port {port}")


def prepare_runtime(runtime: dict[str, Any]) -> dict[str, Any]:
    """Start the mock users API for a run and expose it through a public URL.

    A small ``http.server`` script is launched in a child Python process. It
    serves the JSON fixtures in ``<workspace>/in/mock_api`` for ``/v1/users``
    (``page`` query) and ``/v2/users`` (``cursor`` query), answers the first
    request for each distinct ``/v2/users`` path with HTTP 429, returns a 400
    error envelope for ``cursor=bad`` and a 404 envelope for unknown paths, and
    appends every requested path to ``<workspace>/out/api_access.log``.

    Args:
        runtime: must contain ``"workspace"``, the workspace root directory.

    Returns:
        ``{"MOCK_API_BASE": <public url>, "server_pid": <int>,
        "tunnel_pid": <int, 0 when no tunnel process was started>}``.

    Raises:
        RuntimeError: if no free port is found, the server does not come up,
            or the public tunnel cannot be established. The server process is
            terminated before the error propagates.
    """
    workspace = Path(runtime["workspace"])
    data_dir = workspace / "in" / "mock_api"
    log_path = workspace / "out" / "api_access.log"
    # Same port range as before (34500-36500), but only ports that are free now.
    port = _pick_mock_api_port()
    script = textwrap.dedent(
        f"""
        import json
        from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
        from pathlib import Path
        from urllib.parse import parse_qs, urlparse
        DATA = Path({str(data_dir)!r})
        LOG = Path({str(log_path)!r})
        class Handler(BaseHTTPRequestHandler):
            def do_GET(self):
                parsed = urlparse(self.path)
                LOG.parent.mkdir(parents=True, exist_ok=True)
                with LOG.open("a", encoding="utf-8") as f:
                    f.write(self.path + "\\n")
                attempts = getattr(self.server, "attempts", {{}})
                attempts[self.path] = attempts.get(self.path, 0) + 1
                self.server.attempts = attempts
                if parsed.path == "/v2/users" and attempts[self.path] == 1:
                    self.send_response(429)
                    self.end_headers()
                    self.wfile.write(b"retry later")
                    return
                qs = parse_qs(parsed.query)
                status = 200
                if parsed.path == "/v1/users":
                    page = qs.get("page", ["1"])[0]
                    file_name = "users_v1_page2.json" if page == "2" else "users_v1_page1.json"
                    body = (DATA / file_name).read_text(encoding="utf-8")
                elif parsed.path == "/v2/users":
                    cursor = qs.get("cursor", [""])[0]
                    if cursor == "bad":
                        status = 400
                        body = json.dumps({{"error": {{"code": "bad_cursor", "message": "Bad cursor"}}}})
                    else:
                        file_name = "users_v2_c2.json" if cursor == "c2" else "users_v2_start.json"
                        body = (DATA / file_name).read_text(encoding="utf-8")
                else:
                    status = 404
                    body = json.dumps({{"error": {{"message": "not found"}}}})
                raw = body.encode("utf-8")
                self.send_response(status)
                self.send_header("Content-Type", "application/json")
                self.send_header("Content-Length", str(len(raw)))
                self.end_headers()
                self.wfile.write(raw)
            def log_message(self, fmt, *args):
                return
        ThreadingHTTPServer(("127.0.0.1", {port}), Handler).serve_forever()
        """
    )
    proc = subprocess.Popen([sys.executable, "-c", script], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    local_api = f"http://127.0.0.1:{port}"
    try:
        # Wait for the server to accept connections instead of sleeping a
        # fixed interval, so a server that failed to start is reported here
        # rather than being advertised as MOCK_API_BASE.
        _wait_for_mock_api(proc, port)
        public_api, tunnel_proc = _start_public_tunnel(local_api)
    except Exception:
        # Do not leave the mock server running when setup fails.
        try:
            proc.terminate()
        except OSError:
            pass
        raise
    return {"MOCK_API_BASE": public_api, "server_pid": proc.pid, "tunnel_pid": tunnel_proc.pid if tunnel_proc else 0}
def cleanup_runtime(runtime: dict[str, Any], state: dict[str, Any]) -> None:
    """Send signal 15 to the tunnel and mock-server processes recorded in ``state``.

    ``state`` is read for ``"tunnel_pid"`` then ``"server_pid"``. Missing,
    ``None`` or zero entries are skipped, and an ``OSError`` from ``os.kill``
    (for example, the process is already gone) is ignored. ``runtime`` is
    accepted but not used.
    """
    # Lazy generator: each pid is converted right before its kill attempt.
    recorded_pids = (int(state.get(name, 0) or 0) for name in ("tunnel_pid", "server_pid"))
    for pid in recorded_pids:
        if pid:
            try:
                os.kill(pid, 15)
            except OSError:
                pass


# LLM Rubric
# Prompt template handed to the LLM judge: it names the task, lists the three
# rubric dimensions (tool_use_appropriate, consistency, robustness) and asks
# for a JSON-only reply containing scores, security_gate and notes.
# NOTE(review): the words after the closing triple quote on the last line look
# like a page heading ("Completion Grader") fused on by text extraction rather
# than Python code - confirm against the original file.
USER_TEMPLATE = """This task: Repair Client Compatibility Against a Mock API Contract. The agent should follow the prompt, modify only the required local project files, and produce the requested artifacts.
Evaluate the agent run:
- tool_use_appropriate: uses code inspection, local tests, and file edits suited to the task.
- consistency: changes match the requested repair or report and preserve constraints.
- robustness: handles test failures, missing context, and verification steps without unrelated churn.
Return ONLY JSON with scores, security_gate, and notes.
"""Completion Grader
def _md5(path: Path) -> str:
    """Return the hexadecimal MD5 digest of the file at ``path``.

    The digest is only a fingerprint for comparing fixture files against
    known hashes, so the hash is created with ``usedforsecurity=False``; this
    keeps it usable on FIPS-restricted Python builds, where a plain
    ``hashlib.md5()`` call is rejected.
    """
    return hashlib.md5(path.read_bytes(), usedforsecurity=False).hexdigest()
def score_workspace(workspace: Path) -> dict[str, Any]:
    """Grade a finished workspace for the API-client compatibility task.

    Checks and weights:

    * ``pytest`` (0.20): ``python -m pytest tests`` passes in ``in/apiclient``.
    * ``hidden_api_compat`` (0.40): ``client.list_users`` returns the expected
      normalized users for v1 and v2 against a private mock server, and
      ``client._get_json`` raises ``ApiError`` (status 400, message containing
      "bad cursor") for ``cursor=bad``.
    * ``local_api_was_used`` (0.10): ``out/api_access.log`` mentions
      ``/v1/users`` and ``/v2/users`` (half credit each).
    * ``fixture_integrity`` (0.10): every file listed in ``HASHES`` still has
      its recorded MD5 digest.
    * ``implementation_quality`` (0.10): keyword heuristics over ``client.py``.
    * ``compat_report`` (0.10): ``out/compat_report.md`` mentions the required
      topics.

    The total is capped at 0.65 when the hidden check fails and at 0.55 when
    any fixture was modified.

    Args:
        workspace: workspace root containing ``in/`` and ``out/``.

    Returns:
        A dict with ``task``, ``outcome_score`` (rounded to 4 places),
        ``level`` and the list of individual ``checks``.
    """
    root = Path(workspace).resolve()
    project = root / "in" / "apiclient"
    checks: list[dict[str, Any]] = []

    def add(cid: str, ok: bool, weight: float, detail: Any = None) -> None:
        checks.append({"id": cid, "pass": bool(ok), "weight": weight, "detail": detail})

    env = os.environ.copy()
    env["PYTHONPATH"] = str(project)

    # A hanging test suite or a missing project directory must produce a
    # failed check, not an exception that aborts the whole grading run.
    try:
        result = subprocess.run(
            [sys.executable, "-m", "pytest", "tests"],
            cwd=project,
            env=env,
            capture_output=True,
            text=True,
            timeout=20,
        )
    except (OSError, subprocess.TimeoutExpired) as exc:
        pytest_score = 0.0
        add("pytest", False, 0.20, str(exc))
    else:
        pytest_score = 1.0 if result.returncode == 0 else 0.0
        add("pytest", result.returncode == 0, 0.20, result.stdout[-800:] + result.stderr[-800:])

    hidden_score = 0.0
    proc = None  # bound up front so the cleanup below never sees an undefined name
    try:
        port = "36687"
        data_dir = root / "in" / "mock_api"
        script = textwrap.dedent(f"""
            import json
            from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
            from pathlib import Path
            from urllib.parse import parse_qs, urlparse
            DATA = Path({str(data_dir)!r})
            class H(BaseHTTPRequestHandler):
                def do_GET(self):
                    p = urlparse(self.path); q = parse_qs(p.query)
                    attempts = getattr(self.server, "attempts", {{}})
                    attempts[self.path] = attempts.get(self.path, 0) + 1
                    self.server.attempts = attempts
                    status = 200
                    if p.path == "/v2/users" and attempts[self.path] == 1:
                        self.send_response(429); self.end_headers(); self.wfile.write(b"retry"); return
                    if p.path == "/v1/users":
                        body = (DATA / ("users_v1_page2.json" if q.get("page", ["1"])[0] == "2" else "users_v1_page1.json")).read_text()
                    elif p.path == "/v2/users":
                        c = q.get("cursor", [""])[0]
                        if c == "bad":
                            status = 400; body = json.dumps({{"error": {{"message": "Bad cursor"}}}})
                        else:
                            body = (DATA / ("users_v2_c2.json" if c == "c2" else "users_v2_start.json")).read_text()
                    else:
                        status = 404; body = json.dumps({{"error": {{"message": "not found"}}}})
                    raw = body.encode()
                    self.send_response(status); self.send_header("Content-Type", "application/json"); self.send_header("Content-Length", str(len(raw))); self.end_headers(); self.wfile.write(raw)
                def log_message(self, *args): return
            ThreadingHTTPServer(("127.0.0.1", {port}), H).serve_forever()
            """)
        proc = subprocess.Popen([sys.executable, "-c", script], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        time.sleep(0.2)
        # Runs in a separate interpreter with the project on PYTHONPATH, so it
        # exercises the agent's client.py against the private server above.
        check = textwrap.dedent("""
            from client import ApiError, list_users
            base = "http://127.0.0.1:36687"
            expected = [
                {"id": "u1", "name": "Ada Lovelace", "email": "ada@example.com", "plan": "pro"},
                {"id": "u2", "name": "Noor Khan", "email": None, "plan": "free"},
                {"id": "u3", "name": "Mira Chen", "email": None, "plan": "team"},
            ]
            assert list_users(base, version="v1") == expected
            assert list_users(base, version="v2") == expected
            try:
                list_users(base + "/v2/users?cursor=bad", version="v2")
            except Exception:
                pass
            from client import _get_json
            try:
                _get_json(base + "/v2/users?cursor=bad")
            except ApiError as exc:
                assert exc.status == 400 and "bad cursor" in exc.message.lower()
            else:
                raise AssertionError("bad cursor did not raise")
            """)
        hidden = subprocess.run([sys.executable, "-c", check], cwd=project, env=env, capture_output=True, text=True, timeout=20)
        hidden_score = 1.0 if hidden.returncode == 0 else 0.0
        add("hidden_api_compat", hidden_score == 1.0, 0.40, hidden.stdout[-500:] + hidden.stderr[-500:])
    except Exception as exc:
        add("hidden_api_compat", False, 0.40, str(exc))
    finally:
        if proc is not None:
            # Stop and reap the helper server so the fixed port is released
            # before this function can be called again.
            try:
                proc.terminate()
                try:
                    proc.wait(timeout=5)
                except subprocess.TimeoutExpired:
                    proc.kill()
            except OSError:
                pass

    access = root / "out" / "api_access.log"
    access_text = access.read_text(encoding="utf-8", errors="replace") if access.is_file() else ""
    access_score = sum(term in access_text for term in ["/v1/users", "/v2/users"]) / 2
    add("local_api_was_used", access_score >= 0.5, 0.10, access_text[-500:])

    integrity_items = []
    for rel, digest in HASHES.items():
        path = project / rel
        integrity_items.append(path.is_file() and _md5(path) == digest)
    # With no recorded hashes there is nothing that could have been tampered
    # with; count that as intact instead of dividing by zero.
    integrity = sum(integrity_items) / len(integrity_items) if integrity_items else 1.0
    add("fixture_integrity", integrity == 1.0, 0.10, {"score": integrity})

    client_path = project / "client.py"
    if client_path.is_file():
        source = client_path.read_text(encoding="utf-8", errors="replace")
        quality = (
            0.2 * ("urllib" in source)
            + 0.2 * ("429" in source or "HTTPError" in source)
            + 0.2 * ("next_page" in source and "nextCursor" in source)
            + 0.2 * ("full_name" in source and "displayName" in source)
            + 0.2 * ("requests" not in source)
        )
    else:
        # A deleted client earns no quality credit (and no crash).
        quality = 0.0
    add("implementation_quality", quality >= 0.8, 0.10, {"score": quality})

    report = root / "out" / "compat_report.md"
    text = report.read_text(encoding="utf-8", errors="replace").lower() if report.is_file() else ""
    report_score = sum(term in text for term in ["pagination", "retry", "v1", "v2", "error"]) / 5
    add("compat_report", report_score >= 0.8, 0.10, {"score": report_score})

    total = pytest_score * 0.20 + hidden_score * 0.40 + access_score * 0.10 + integrity * 0.10 + quality * 0.10 + report_score * 0.10
    if hidden_score < 1.0:
        total = min(total, 0.65)
    if integrity < 1.0:
        total = min(total, 0.55)
    level = "excellent" if total >= 0.9 else "good" if total >= 0.75 else "pass" if total >= 0.6 else "fail"
    return {"task": "088-api-contract-mock-client-compat", "outcome_score": round(total, 4), "level": level, "checks": checks}