Model Runs: 6 harnesses & 8 models evaluated on this task.
Loading...
PromptWorkspace, Tool Use & Multimodal Operations · Task 4
Task theme: Bilibili-style page (this is a local mock; it does not connect to the public bilibili.com). Working directory: $WORKSPACE.
Context
The harness has started a local HTTP server with static HTML that mimics a Bilibili space layout (fictional uploader: MockBili Demo Channel), with 3 submissions listed in descending play count order, to practice “open a Bilibili-like page → extract video titles”.
Goals
- Open `$MOCK_PAGE` with a browser or exec (e.g. `curl`).
- Extract the three video titles in play-count order (high to low), matching top-to-bottom on the page, and write them to `$WORKSPACE/out/titles.txt`: UTF-8, one title per line, no extra leading/trailing whitespace.
- Write the page URL to `$WORKSPACE/out/source_url.txt` as a single line.
- Run `mkdir -p $WORKSPACE/out` before writing outputs.
Verification
The page footer contains BENCHMARK_PAGE_MOCK_BILI to confirm you opened this fixture.
Input Files: 1 file
www/index.html
Hooks
def prepare_runtime(runtime: dict[str, Any]) -> dict[str, Any]:
    """Serve ``<workspace>/www`` over local HTTP and expose it through a tunnel.

    A ``python3 -m http.server`` child is started on a random port in
    ``[32000, 34000]``. Because the port is picked blindly it may already be
    taken, in which case the child exits almost immediately; that is detected
    with ``poll()`` and another port is tried, instead of handing a dead
    address to the tunnel.

    Args:
        runtime: Harness runtime mapping; only ``runtime["workspace"]`` is read.

    Returns:
        A mapping with ``MOCK_PAGE`` (the tunnel URL with a trailing slash),
        ``server_pid`` and ``tunnel_pid`` (``0`` when the tunnel helper
        returned no process object).

    Raises:
        RuntimeError: if the HTTP server exited on every attempted port.
        Exception: whatever ``_start_public_tunnel`` raises is propagated
            after the HTTP server has been asked to terminate.
    """
    workspace = Path(runtime["workspace"])
    www = workspace / "www"

    max_attempts = 5
    proc = None
    port = 0
    for _ in range(max_attempts):
        port = 32000 + random.randint(0, 2000)
        candidate = subprocess.Popen(
            ["python3", "-m", "http.server", str(port), "--directory", str(www)],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
        # Give the child a moment to bind (or to fail binding) the port.
        time.sleep(0.3)
        if candidate.poll() is None:
            proc = candidate
            break
        # The child already exited (typically: address in use); poll() has
        # reaped it, so simply try again with a different port.
    if proc is None:
        raise RuntimeError(
            f"http.server failed to start after {max_attempts} attempts "
            f"(last port tried: {port})"
        )

    local_page = f"http://127.0.0.1:{port}/"
    try:
        # NOTE(review): _start_public_tunnel is defined elsewhere; it is
        # assumed to return (public_base_url, process_or_None) - confirm.
        public_page, tunnel_proc = _start_public_tunnel(local_page.rstrip("/"))
    except Exception:
        # Do not leave the HTTP server behind when the tunnel cannot start.
        try:
            proc.terminate()
        except OSError:
            pass
        raise

    mock_page = public_page + "/"
    return {
        "MOCK_PAGE": mock_page,
        "server_pid": proc.pid,
        "tunnel_pid": tunnel_proc.pid if tunnel_proc else 0,
    }
def cleanup_runtime(runtime: dict[str, Any], state: dict[str, Any]) -> None:
    """Ask the processes recorded in *state* to terminate.

    The tunnel PID is signalled before the server PID. Missing, ``None`` or
    non-positive entries are skipped, and any ``OSError`` raised while
    signalling (for example because the process is already gone) is ignored.

    Args:
        runtime: Harness runtime mapping; not used here.
        state: Mapping that may hold ``"tunnel_pid"`` and ``"server_pid"``.
    """
    import os

    for field in ("tunnel_pid", "server_pid"):
        target = int(state.get(field, 0) or 0)
        if target > 0:
            try:
                os.kill(target, 15)  # 15 is SIGTERM on POSIX
            except OSError:
                # Best effort: nothing to do if the signal cannot be sent.
                pass


# LLM Rubric
USER_TEMPLATE = """This task: Task theme: **Bilibili-style page** (this is a **local mock**; it does not connect to the public bilibili.com). Working directory: `$WORKSPACE`. ## Context The harness has started a **local HTTP server** with static HTML...
Key outputs: out, out/source_url.txt, out/titles.txt.
Evaluate the agent run:
- tool_use_appropriate: uses tools that match the task modality and files/APIs.
- consistency: follows the prompt requirements and writes the expected artifacts.
- robustness: handles missing directories, malformed inputs, retries, or local-server issues when relevant.
Return ONLY JSON with scores, security_gate, and notes.
"""
Completion Grader
def score_workspace(
workspace: Path,
*,
ground_truth_path: Path | None = None,
) -> dict[str, Any]:
w = workspace.resolve()
gt_path = ground_truth_path or _DEFAULT_GT
checks: list[dict[str, Any]] = []
if not gt_path.is_file():
return {
"task": "006-access-bilibili",
"workspace": str(w),
"checks": [],
"outcome_score": 0.0,
"error": f"missing ground_truth: {gt_path}",
}
gt = json.loads(gt_path.read_text(encoding="utf-8"))
min_lines = int(gt.get("min_line_count", 3))
min_c = int(gt.get("min_line_chars", 2))
max_c = int(gt.get("max_line_chars", 240))
expected: list[str] = list(gt.get("expected_ordered_titles") or [])
titles_path = w / "out" / "titles.txt"
lines: list[str] = []
if titles_path.is_file():
try:
raw = titles_path.read_text(encoding="utf-8", errors="strict")
lines = [ln.strip() for ln in raw.splitlines() if ln.strip()]
except (OSError, UnicodeError):
lines = []
need = max(min_lines, len(expected)) if expected else min_lines
pass_file = titles_path.is_file() and len(lines) >= need
checks.append(
{
"id": "titles_min_lines",
"label": f"out/titles.txt exists with >= {need} non-empty lines",
"pass": pass_file,
"weight": 0.15,
"detail": None if pass_file else f"got {len(lines)}, need {need}",
}
)
prefix_ok = False
prefix_detail = None
if expected:
if len(lines) < len(expected):
prefix_detail = f"need {len(expected)} lines for prefix check, got {len(lines)}"
else:
mismatches = [
i
for i, exp in enumerate(expected)
if lines[i] != exp
]
if mismatches:
i = mismatches[0]
prefix_detail = f"line {i + 1}: expected {expected[i]!r}, got {lines[i]!r}"
else:
prefix_ok = True
else:
prefix_ok = True
checks.append(
{
"id": "expected_top_by_views",
"label": "first N lines match expected_ordered_titles (play-count desc snapshot)",
"pass": prefix_ok,
"weight": 0.5,
"detail": prefix_detail,
}
)
bad_fmt = False
detail_fmt = None
for i, ln in enumerate(lines):
if len(ln) < min_c or len(ln) > max_c:
bad_fmt = True
detail_fmt = f"line {i + 1} length {len(ln)} not in [{min_c},{max_c}]"
break
pass_fmt = bool(lines) and not bad_fmt
checks.append(
{
"id": "title_line_shape",
"label": f"each title line length in [{min_c},{max_c}]",
"pass": pass_fmt,
"weight": 0.2,
"detail": detail_fmt,
}
)
src = w / "out" / "source_url.txt"
url_ok = False
if src.is_file():
try:
u = src.read_text(encoding="utf-8", errors="replace").strip()
url_ok = len(u) >= 8 and ("http" in u or "bilibili" in u.lower())
except OSError:
url_ok = False
checks.append(
{
"id": "source_url",
"label": "out/source_url.txt non-empty and looks like a URL/page ref",
"pass": url_ok,
"weight": 0.15,
"detail": None if url_ok else "missing or too short",
}
)
outcome = round(sum(c["weight"] for c in checks if c["pass"]), 4)
return {
"task": "006-access-bilibili",
"workspace": str(w),
"checks": checks,
"outcome_score": outcome,
}