Access A Local Bilibili-Style Mock Page

Task theme: Bilibili-style page (this is a local mock; it does not connect to the public bilibili.com). Working directory: $WORKSPACE.

Workspace, Tool Use & Multimodal Operations · Task 4 · Oracle + LLM scoring
Model Runs: 6 harnesses & 8 models evaluated on this task.
Loading...
Prompt: Workspace, Tool Use & Multimodal Operations · Task 4

Task theme: Bilibili-style page (this is a local mock; it does not connect to the public bilibili.com). Working directory: $WORKSPACE.

Context

The harness has started a local HTTP server with static HTML that mimics a Bilibili space layout (fictional uploader: MockBili Demo Channel), with 3 submissions listed in descending play count order, to practice “open a Bilibili-like page → extract video titles”.

Goals

  1. Open $MOCK_PAGE with a browser or exec (e.g. curl).
  2. Extract the three video titles in play-count order (high to low), matching top-to-bottom on the page, and write them to $WORKSPACE/out/titles.txt: UTF-8, one title per line, no extra leading/trailing whitespace.
  3. Write the page URL to $WORKSPACE/out/source_url.txt as a single line.
  4. Run mkdir -p $WORKSPACE/out before writing outputs.

Verification

The page footer contains BENCHMARK_PAGE_MOCK_BILI to confirm you opened this fixture.

Input Files: 1 file
www/index.html
Hooks
def prepare_runtime(runtime: dict[str, Any]) -> dict[str, Any]:
    """Start the static mock-page server and expose it through a tunnel.

    Serves ``<workspace>/www`` with ``python3 -m http.server`` on a random
    port in 32000-34000 bound to the local machine, then hands the local URL
    to ``_start_public_tunnel``.

    Args:
        runtime: Harness runtime mapping; only ``runtime["workspace"]`` is
            read here.

    Returns:
        A state mapping with ``MOCK_PAGE`` (the tunnel URL with a trailing
        slash), ``server_pid`` and ``tunnel_pid`` (0 when the tunnel helper
        returned no process object).

    Raises:
        RuntimeError: If the HTTP server process exited during startup on
            every attempted port.
        Exception: Whatever ``_start_public_tunnel`` raises is re-raised
            after the HTTP server has been stopped.
    """
    workspace = Path(runtime["workspace"])
    www = workspace / "www"

    # The port is picked blindly, so it may already be taken; in that case
    # http.server exits almost immediately. Retry on a fresh port instead of
    # publishing a URL that points at whatever else owns the port.
    port = 0
    for _ in range(3):
        port = 32000 + random.randint(0, 2000)
        proc = subprocess.Popen(
            ["python3", "-m", "http.server", str(port), "--directory", str(www)],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
        # Short grace period for the server to bind. This is not a readiness
        # probe; it only lets an immediate failure show up in poll().
        time.sleep(0.3)
        if proc.poll() is None:
            break
    else:
        raise RuntimeError(
            f"mock HTTP server exited during startup (last port tried: {port})"
        )

    local_page = f"http://127.0.0.1:{port}/"
    try:
        # NOTE(review): _start_public_tunnel is defined elsewhere; from its use
        # here it returns (url_without_trailing_slash, process_or_falsy).
        public_page, tunnel_proc = _start_public_tunnel(local_page.rstrip("/"))
    except Exception:
        # Do not leave the server running (or as a zombie) when the tunnel
        # cannot be established.
        try:
            proc.terminate()
            proc.wait(timeout=2)
        except (OSError, subprocess.TimeoutExpired):
            pass
        raise
    mock_page = public_page + "/"
    return {
        "MOCK_PAGE": mock_page,
        "server_pid": proc.pid,
        "tunnel_pid": tunnel_proc.pid if tunnel_proc else 0,
    }

def cleanup_runtime(runtime: dict[str, Any], state: dict[str, Any]) -> None:
    """Send SIGTERM to the tunnel and server processes recorded in ``state``.

    Cleanup is best-effort: missing, zero, negative or malformed PID entries
    are skipped, and failures to signal a process (for example because it
    has already exited) are ignored.

    Args:
        runtime: Harness runtime mapping; not used by this hook.
        state: Mapping that may hold ``tunnel_pid`` and ``server_pid``.
    """
    import os
    import signal

    for key in ("tunnel_pid", "server_pid"):
        try:
            pid = int(state.get(key, 0) or 0)
        except (TypeError, ValueError):
            # A malformed entry must not prevent the other process from
            # being stopped.
            continue
        # Non-positive PIDs address process groups in os.kill; never send
        # those.
        if pid <= 0:
            continue
        try:
            os.kill(pid, signal.SIGTERM)
        except (OSError, OverflowError):
            # Process already gone, not ours, or PID out of range.
            pass
LLM Rubric
# Rubric prompt text. It restates the task, lists the key output paths, names
# the three dimensions to evaluate (tool_use_appropriate, consistency,
# robustness) and asks for a JSON-only reply with scores, security_gate and
# notes.
# NOTE(review): the "..." at the end of the first line is part of the literal
# as shown here; presumably the task text was truncated upstream -- confirm.
USER_TEMPLATE = """This task: Task theme: **Bilibili-style page** (this is a **local mock**; it does not connect to the public bilibili.com). Working directory: `$WORKSPACE`. ## Context The harness has started a **local HTTP server** with static HTML...

Key outputs: out, out/source_url.txt, out/titles.txt.

Evaluate the agent run:
- tool_use_appropriate: uses tools that match the task modality and files/APIs.
- consistency: follows the prompt requirements and writes the expected artifacts.
- robustness: handles missing directories, malformed inputs, retries, or local-server issues when relevant.

Return ONLY JSON with scores, security_gate, and notes.
"""
Completion Grader
def score_workspace(
    workspace: Path,
    *,
    ground_truth_path: Path | None = None,
) -> dict[str, Any]:
    w = workspace.resolve()
    gt_path = ground_truth_path or _DEFAULT_GT
    checks: list[dict[str, Any]] = []

    if not gt_path.is_file():
        return {
            "task": "006-access-bilibili",
            "workspace": str(w),
            "checks": [],
            "outcome_score": 0.0,
            "error": f"missing ground_truth: {gt_path}",
        }

    gt = json.loads(gt_path.read_text(encoding="utf-8"))
    min_lines = int(gt.get("min_line_count", 3))
    min_c = int(gt.get("min_line_chars", 2))
    max_c = int(gt.get("max_line_chars", 240))
    expected: list[str] = list(gt.get("expected_ordered_titles") or [])

    titles_path = w / "out" / "titles.txt"
    lines: list[str] = []
    if titles_path.is_file():
        try:
            raw = titles_path.read_text(encoding="utf-8", errors="strict")
            lines = [ln.strip() for ln in raw.splitlines() if ln.strip()]
        except (OSError, UnicodeError):
            lines = []

    need = max(min_lines, len(expected)) if expected else min_lines
    pass_file = titles_path.is_file() and len(lines) >= need
    checks.append(
        {
            "id": "titles_min_lines",
            "label": f"out/titles.txt exists with >= {need} non-empty lines",
            "pass": pass_file,
            "weight": 0.15,
            "detail": None if pass_file else f"got {len(lines)}, need {need}",
        }
    )

    prefix_ok = False
    prefix_detail = None
    if expected:
        if len(lines) < len(expected):
            prefix_detail = f"need {len(expected)} lines for prefix check, got {len(lines)}"
        else:
            mismatches = [
                i
                for i, exp in enumerate(expected)
                if lines[i] != exp
            ]
            if mismatches:
                i = mismatches[0]
                prefix_detail = f"line {i + 1}: expected {expected[i]!r}, got {lines[i]!r}"
            else:
                prefix_ok = True
    else:
        prefix_ok = True
    checks.append(
        {
            "id": "expected_top_by_views",
            "label": "first N lines match expected_ordered_titles (play-count desc snapshot)",
            "pass": prefix_ok,
            "weight": 0.5,
            "detail": prefix_detail,
        }
    )

    bad_fmt = False
    detail_fmt = None
    for i, ln in enumerate(lines):
        if len(ln) < min_c or len(ln) > max_c:
            bad_fmt = True
            detail_fmt = f"line {i + 1} length {len(ln)} not in [{min_c},{max_c}]"
            break
    pass_fmt = bool(lines) and not bad_fmt
    checks.append(
        {
            "id": "title_line_shape",
            "label": f"each title line length in [{min_c},{max_c}]",
            "pass": pass_fmt,
            "weight": 0.2,
            "detail": detail_fmt,
        }
    )

    src = w / "out" / "source_url.txt"
    url_ok = False
    if src.is_file():
        try:
            u = src.read_text(encoding="utf-8", errors="replace").strip()
            url_ok = len(u) >= 8 and ("http" in u or "bilibili" in u.lower())
        except OSError:
            url_ok = False
    checks.append(
        {
            "id": "source_url",
            "label": "out/source_url.txt non-empty and looks like a URL/page ref",
            "pass": url_ok,
            "weight": 0.15,
            "detail": None if url_ok else "missing or too short",
        }
    )

    outcome = round(sum(c["weight"] for c in checks if c["pass"]), 4)
    return {
        "task": "006-access-bilibili",
        "workspace": str(w),
        "checks": checks,
        "outcome_score": outcome,
    }