Model Runs: 6 harnesses & 8 models evaluated on this task.
Loading...
Prompt: Workspace, Tool Use & Multimodal Operations · Task 7
You are working in the task workspace. Use only local files.
Inputs:
- $WORKSPACE/in/bundle_a.zip
- $WORKSPACE/in/bundle_b.tar
- $WORKSPACE/in/checksums.json
Task:
- Extract both archives without modifying the archive files.
- Compute size in bytes and sha256 for every regular file actually present in the archives.
- Compare those files against $WORKSPACE/in/checksums.json.
- Write $WORKSPACE/out/manifest.json and $WORKSPACE/out/mismatches.txt.
Required $WORKSPACE/out/manifest.json format:
{
"files": [
{"archive": "bundle_a.zip", "path": "data/metrics.csv", "size": 49, "sha256": "..."}
]
}
Rules:
- The files array must contain every actual archived regular file exactly once.
- Sort files by archive, then path.
- path must be the path inside the archive, without leading ./.
- Write $WORKSPACE/out/mismatches.txt as UTF-8 text, one row per mismatch, sorted exactly as:
archive,path,reason
- Use reason "checksum_mismatch" for a present file whose sha256 differs from the checklist.
- Use reason "missing" for a checklist entry not present in the extracted archive.
- Do not list matching files in mismatches.txt.
- Do not modify input files.
Input Files: 3 files
in/bundle_a.zip
in/bundle_b.tar
in/checksums.json
LLM Rubric
USER_TEMPLATE = """This task: You are working in the task workspace. Use only local files. Inputs: - $WORKSPACE/in/bundle_a.zip - $WORKSPACE/in/bundle_b.tar - $WORKSPACE/in/checksums.json Task: 1. Extract both archives without modifying the archive f...
Key outputs: in/bundle_a.zip, in/bundle_b.tar, in/checksums.json, in/checksums.json., out/manifest.json.
Evaluate the agent run:
- tool_use_appropriate: uses tools that match the task modality and files/APIs.
- consistency: follows the prompt requirements and writes the expected artifacts.
- robustness: handles missing directories, malformed inputs, retries, or local-server issues when relevant.
Return ONLY JSON with scores, security_gate, and notes.
"""
Completion Grader
def _check(checks: list[dict[str, Any]], cid: str, label: str, ok: bool, weight: float, detail: str | None = None) -> None:
checks.append({"id": cid, "label": label, "pass": ok, "weight": weight, "detail": None if ok else detail})
def _sha256(path: Path) -> str:
h = hashlib.sha256()
with path.open("rb") as f:
for chunk in iter(lambda: f.read(65536), b""):
h.update(chunk)
return h.hexdigest()
def _workspace_rel(rel: str) -> Path:
path = Path(rel)
if path.parts and path.parts[0] == "fixtures":
return Path(*path.parts[1:])
return path
def _sorted_by_archive_path(files: Any) -> bool:
    """Return True when ``files`` is a list of dicts ordered by (archive, path)."""
    if not isinstance(files, list) or not all(isinstance(row, dict) for row in files):
        # Agent output is untrusted: non-list or non-dict rows are simply "not sorted".
        return False
    try:
        return files == sorted(files, key=lambda row: (row.get("archive", ""), row.get("path", "")))
    except TypeError:
        # Mixed-type key values (e.g. a null or numeric "archive") cannot be ordered;
        # treat the manifest as malformed rather than crashing the grader.
        return False


def score_workspace(workspace: Path) -> dict[str, Any]:
    """Grade an agent workspace for the archive-checksum task.

    Ground truth is loaded from ``ground_truth.json`` in the directory that
    contains this file. The following weighted checks are recorded:

    * one check (weight 1.0) per entry in ``truth["archive_hashes"]`` that the
      source archive still has its expected SHA-256;
    * ``manifest_parse`` (1.0): ``out/manifest.json`` parses to a JSON object;
    * ``manifest_files`` (5.0) and ``manifest_sorted`` (1.0): only recorded
      when the manifest parsed to an object;
    * ``mismatch_read`` (1.0): ``out/mismatches.txt`` is readable as UTF-8;
    * ``mismatch_exact`` (4.0): its non-blank, stripped lines equal
      ``truth["mismatches"]``. An unreadable file never passes this check.

    Args:
        workspace: Root directory of the workspace being graded.

    Returns:
        A dict with the keys ``task``, ``workspace``, ``checks``,
        ``outcome_score`` (passed weight divided by total weight) and
        ``grade`` (``excellent`` >= 0.9, ``good`` >= 0.75, ``pass`` >= 0.6,
        otherwise ``fail``).

    Raises:
        OSError, ValueError, KeyError: if ``ground_truth.json`` cannot be
            read, is not valid JSON, or lacks an expected key. Problems with
            the agent's own output files are reported as failed checks.
    """
    task_dir = Path(__file__).resolve().parent
    truth = json.loads((task_dir / "ground_truth.json").read_text(encoding="utf-8"))
    checks: list[dict[str, Any]] = []

    # The original archive files must not have been modified.
    for rel, expected_hash in truth["archive_hashes"].items():
        path = workspace / _workspace_rel(rel)
        actual = _sha256(path) if path.is_file() else ""
        _check(checks, "source_" + Path(rel).name, f"{rel} unchanged", actual == expected_hash, 1.0, f"got {actual!r}")

    # manifest.json validation.
    manifest_path = workspace / "out" / "manifest.json"
    manifest: Any = None
    try:
        manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
    except Exception as exc:  # grader boundary: any read/parse failure is a failed check
        _check(checks, "manifest_parse", "out/manifest.json is valid JSON", False, 1.0, str(exc))
    else:
        _check(checks, "manifest_parse", "out/manifest.json is valid JSON", isinstance(manifest, dict), 1.0, "not a dict")

    if isinstance(manifest, dict):
        expected_files = sorted(truth["manifest_files"], key=lambda x: (x["archive"], x["path"]))
        actual_files = manifest.get("files")
        files_ok = actual_files == expected_files
        _check(checks, "manifest_files", "manifest lists exact archive/path/size/sha256 rows", files_ok, 5.0, f"got {actual_files!r}")
        sorted_ok = _sorted_by_archive_path(actual_files)
        _check(checks, "manifest_sorted", "manifest files are sorted by archive then path", sorted_ok, 1.0)

    # mismatches.txt: read the file, then check its content.
    mismatch_path = workspace / "out" / "mismatches.txt"
    lines: list[str] = []
    readable = False
    try:
        text = mismatch_path.read_text(encoding="utf-8")
    except Exception as exc:  # grader boundary: missing/undecodable file is a failed check
        _check(checks, "mismatch_read", "out/mismatches.txt can be read", False, 1.0, str(exc))
    else:
        readable = True
        # Blank lines and surrounding whitespace are ignored.
        lines = [line.strip() for line in text.splitlines() if line.strip()]
        _check(checks, "mismatch_read", "out/mismatches.txt can be read", True, 1.0)
    expected_mismatches = truth["mismatches"]
    # Require a readable file so a missing output cannot match an empty expected list.
    mismatch_ok = readable and lines == expected_mismatches
    _check(checks, "mismatch_exact", "mismatches.txt contains only expected failing entries", mismatch_ok, 4.0, f"got {lines!r}")

    # Compute the total score. The denominator is fixed, so checks that were
    # skipped above (manifest_files / manifest_sorted) count as failed.
    total_weight = len(truth["archive_hashes"]) * 1.0 + 1.0 + 5.0 + 1.0 + 1.0 + 4.0
    passed_weight = sum(c["weight"] for c in checks if c["pass"])
    score = passed_weight / total_weight if total_weight > 0 else 0.0
    return {
        "task": "020-archive-checksum",
        "workspace": str(workspace),
        "checks": checks,
        "outcome_score": score,
        "grade": "excellent" if score >= 0.9 else "good" if score >= 0.75 else "pass" if score >= 0.6 else "fail",
    }