Task 7 — Workspace, Tool Use & Multimodal Operations

Model Runs: 6 harnesses & 8 models evaluated on this task.

Loading...

Prompt: Workspace, Tool Use & Multimodal Operations · Task 7

You are working in the task workspace. Use only local files.

Inputs:

$WORKSPACE/in/bundle_a.zip
$WORKSPACE/in/bundle_b.tar
$WORKSPACE/in/checksums.json

Task:

Extract both archives without modifying the archive files.
Compute size in bytes and sha256 for every regular file actually present in the archives.
Compare those files against $WORKSPACE/in/checksums.json.
Write $WORKSPACE/out/manifest.json and $WORKSPACE/out/mismatches.txt.

Required $WORKSPACE/out/manifest.json format:
{
"files": [
{"archive": "bundle_a.zip", "path": "data/metrics.csv", "size": 49, "sha256": "..."}
]
}

Rules:

The files array must contain every actual archived regular file exactly once.
Sort files by archive, then path.
path must be the path inside the archive, without leading ./.
Write $WORKSPACE/out/mismatches.txt as UTF-8 text, one row per mismatch, sorted exactly as:

archive,path,reason

Use reason "checksum_mismatch" for a present file whose sha256 differs from the checklist.
Use reason "missing" for a checklist entry not present in the extracted archive.
Do not list matching files in mismatches.txt.
Do not modify input files.

Input Files: 3 files

in/bundle_a.zip

in/bundle_b.tar

in/checksums.json

LLM Rubric

USER_TEMPLATE = """This task: You are working in the task workspace. Use only local files. Inputs: - $WORKSPACE/in/bundle_a.zip - $WORKSPACE/in/bundle_b.tar - $WORKSPACE/in/checksums.json Task: 1. Extract both archives without modifying the archive f...

Key outputs: in/bundle_a.zip, in/bundle_b.tar, in/checksums.json, in/checksums.json., out/manifest.json.

Evaluate the agent run:
- tool_use_appropriate: uses tools that match the task modality and files/APIs.
- consistency: follows the prompt requirements and writes the expected artifacts.
- robustness: handles missing directories, malformed inputs, retries, or local-server issues when relevant.

Return ONLY JSON with scores, security_gate, and notes.
"""

Completion Grader

def _check(checks: list[dict[str, Any]], cid: str, label: str, ok: bool, weight: float, detail: str | None = None) -> None:
    checks.append({"id": cid, "label": label, "pass": ok, "weight": weight, "detail": None if ok else detail})

def _sha256(path: Path) -> str:
    """Return the hexadecimal SHA-256 digest of the file at *path*.

    The file is read in 64 KiB chunks so it is never loaded into memory
    in one piece.
    """
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        # An empty read signals end of file.
        while block := stream.read(65536):
            digest.update(block)
    return digest.hexdigest()

def _workspace_rel(rel: str) -> Path:
    """Convert *rel* to a path, dropping a leading ``fixtures`` component.

    Paths whose first component is not ``fixtures`` are returned as-is.
    """
    parts = Path(rel).parts
    # Slicing keeps this safe for an empty path (no parts at all).
    if parts[:1] == ("fixtures",):
        return Path(*parts[1:])
    return Path(rel)

def score_workspace(workspace: Path) -> dict[str, Any]:
    """Grade an agent's workspace for the archive-checksum task.

    Expected values are loaded from ``ground_truth.json`` in the directory
    that contains this module. Weighted checks are recorded for:

    * each input archive still hashing to its expected SHA-256,
    * ``out/manifest.json`` parsing to a JSON object,
    * the manifest's ``files`` list equalling the expected rows,
    * the manifest's ``files`` list being sorted by archive, then path,
    * ``out/mismatches.txt`` being readable as UTF-8,
    * its non-blank, stripped lines equalling the expected mismatch rows.

    Args:
        workspace: Root directory of the workspace to grade.

    Returns:
        A dict with the task id, the workspace path as a string, the list
        of check records, ``outcome_score`` (passed weight divided by total
        weight) and a ``grade`` of ``"excellent"`` (>= 0.9), ``"good"``
        (>= 0.75), ``"pass"`` (>= 0.6) or ``"fail"``.

    Raises:
        OSError, ValueError, KeyError: if ``ground_truth.json`` is missing,
            is not valid JSON, or lacks an expected key. Problems with the
            agent's output files are recorded as failed checks instead.
    """
    task_dir = Path(__file__).resolve().parent
    truth = json.loads((task_dir / "ground_truth.json").read_text(encoding="utf-8"))
    checks: list[dict[str, Any]] = []

    # The original archive files must not have been modified.
    for rel, expected_hash in truth["archive_hashes"].items():
        path = workspace / _workspace_rel(rel)
        # A missing archive hashes to "" and therefore fails the comparison.
        actual = _sha256(path) if path.is_file() else ""
        _check(checks, "source_" + Path(rel).name, f"{rel} unchanged", actual == expected_hash, 1.0, f"got {actual!r}")

    # Validate out/manifest.json.
    manifest_path = workspace / "out" / "manifest.json"
    manifest: Any = None
    parse_detail = "not a dict"
    try:
        manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
    except Exception as exc:
        # Deliberately broad: any failure to read or parse the agent's
        # output is a failed check, never a grader crash.
        parse_detail = str(exc)
    manifest_ok = isinstance(manifest, dict)
    _check(checks, "manifest_parse", "out/manifest.json is valid JSON", manifest_ok, 1.0, parse_detail)

    if manifest_ok:
        expected_files = sorted(truth["manifest_files"], key=lambda x: (x["archive"], x["path"]))
        actual_files = manifest.get("files")
        files_ok = actual_files == expected_files
        _check(checks, "manifest_files", "manifest lists exact archive/path/size/sha256 rows", files_ok, 5.0, f"got {actual_files!r}")

        # The manifest is agent-written, so rows may be non-dicts or carry
        # keys of mixed types. Treat anything that cannot be ordered as
        # "not sorted" rather than letting the grader raise.
        sorted_ok = False
        if isinstance(actual_files, list):
            try:
                sorted_ok = actual_files == sorted(actual_files, key=lambda x: (x.get("archive", ""), x.get("path", "")))
            except (AttributeError, TypeError):
                sorted_ok = False
        _check(checks, "manifest_sorted", "manifest files are sorted by archive then path", sorted_ok, 1.0)

    # Read out/mismatches.txt and check its content.
    mismatch_path = workspace / "out" / "mismatches.txt"
    lines: list[str] = []
    try:
        mismatch_text = mismatch_path.read_text(encoding="utf-8")
    except Exception as exc:
        # Deliberately broad, for the same reason as the manifest read.
        _check(checks, "mismatch_read", "out/mismatches.txt can be read", False, 1.0, str(exc))
    else:
        # Blank lines and surrounding whitespace are ignored.
        lines = [line.strip() for line in mismatch_text.splitlines() if line.strip()]
        _check(checks, "mismatch_read", "out/mismatches.txt can be read", True, 1.0)

    expected_mismatches = truth["mismatches"]
    mismatch_ok = lines == expected_mismatches
    _check(checks, "mismatch_exact", "mismatches.txt contains only expected failing entries", mismatch_ok, 4.0, f"got {lines!r}")

    # Compute the total score. The denominator always includes the
    # manifest_files (5.0) and manifest_sorted (1.0) weights, even though
    # those checks are only recorded when the manifest parsed to a dict;
    # an unusable manifest therefore forfeits their weight.
    total_weight = len(truth["archive_hashes"]) * 1.0 + 1.0 + 5.0 + 1.0 + 1.0 + 4.0
    passed_weight = sum(c["weight"] for c in checks if c["pass"])
    score = passed_weight / total_weight if total_weight > 0 else 0.0

    return {
        "task": "020-archive-checksum",
        "workspace": str(workspace),
        "checks": checks,
        "outcome_score": score,
        "grade": "excellent" if score >= 0.9 else "good" if score >= 0.75 else "pass" if score >= 0.6 else "fail",
    }

Archive extraction and checksum audit