Inspect Two Local Images And Save Short Answers

Working directory: $WORKSPACE.

Workspace, Tool Use & Multimodal Operations · Task 5 · Oracle + LLM scoring
Model Runs: 6 harnesses & 8 models evaluated on this task.
Loading...
Prompt: Workspace, Tool Use & Multimodal Operations · Task 5

Working directory: $WORKSPACE.

Absolute paths to two images (keep them in the message so vision-capable models can load them):

Image 1: $IMAGE1_ABS_PATH
Image 2: $IMAGE2_ABS_PATH

Tasks:

  1. Inspect each image and answer from what you see (short English; you may join a few very short phrases on one line with commas—you do not need to cover everything, but avoid single-word non-answers).
  • Image 1: besides the geometric shape, include main color and background.
  • Image 2: besides species/object, include main colors, and at least one of foreground or background cues you can see.
  2. Write the answer for image 1 to $WORKSPACE/out/image1_answer.txt: exactly one line.
  3. Write the answer for image 2 to $WORKSPACE/out/image2_answer.txt: exactly one line.
  4. If missing: mkdir -p $WORKSPACE/out.
Input Files: 2 files
image/target1.png
image/target2.jpg
Hooks
def prepare_runtime(runtime: dict[str, Any]) -> dict[str, Any]:
    """Build the ``IMAGE{N}_ABS_PATH`` values for the task's input images.

    Reads ``ground_truth.json`` from the task directory and maps the N-th
    entry (1-based) of its ``image_files`` list to the absolute path of
    ``<workspace>/image/<name>``. The resulting keys match the
    ``$IMAGE1_ABS_PATH`` / ``$IMAGE2_ABS_PATH`` placeholders used in the
    task prompt.

    Args:
        runtime: Mapping that provides ``"workspace"`` (a path) and
            ``"task"`` (an object exposing a ``task_dir`` attribute).

    Returns:
        A dict mapping ``IMAGE{N}_ABS_PATH`` to the resolved path string.

    Raises:
        OSError: If ``ground_truth.json`` cannot be read.
        json.JSONDecodeError: If ``ground_truth.json`` is not valid JSON.
    """
    # Same fallback the completion grader applies when "image_files" is
    # missing or empty, so the prompt paths and the grader's image
    # attachments always refer to the same files.
    default_image_files = ("target1.png", "target2.jpg")

    workspace = Path(runtime["workspace"])
    task_dir = Path(runtime["task"].task_dir)
    data = json.loads((task_dir / "ground_truth.json").read_text(encoding="utf-8"))
    files = list(data.get("image_files") or default_image_files)
    return {
        f"IMAGE{index}_ABS_PATH": str((workspace / "image" / name).resolve())
        for index, name in enumerate(files, start=1)
    }
LLM Rubric
# User-prompt template (per its name) for the rubric evaluation of an agent
# run: it restates the task, lists the key output artifacts, names three
# criteria (tool_use_appropriate, consistency, robustness), and asks for a
# JSON-only reply containing scores, security_gate, and notes.
# NOTE(review): the task restatement on the first line ends in "..." in this
# view -- presumably elided for display; confirm against the real template.
USER_TEMPLATE = """This task: Working directory: `$WORKSPACE`. Absolute paths to two images (keep them in the message so vision-capable models can load them): Image 1: `$IMAGE1_ABS_PATH` Image 2: `$IMAGE2_ABS_PATH` Tasks: 1. **Inspect each image** an...

Key outputs: out, out/image1_answer.txt, out/image2_answer.txt.

Evaluate the agent run:
- tool_use_appropriate: uses tools that match the task modality and files/APIs.
- consistency: follows the prompt requirements and writes the expected artifacts.
- robustness: handles missing directories, malformed inputs, retries, or local-server issues when relevant.

Return ONLY JSON with scores, security_gate, and notes.
"""
Completion Grader
def score_workspace(workspace: Path, *, ground_truth_path: Path | None = None) -> dict[str, Any]:
    """Score a finished run from the two answer files in the workspace.

    Two things are computed:

    * ``outcome_score``: 0.5 for each of ``out/image1_answer.txt`` and
      ``out/image2_answer.txt`` that exists and is non-empty after
      stripping whitespace.
    * ``quality``: a score from the oracle quality LLM, which receives the
      reference descriptions from the ground truth, the agent's answers,
      and the input images. This part is best-effort; any failure is
      recorded in ``quality_rubric_meta`` instead of being raised.

    NOTE(review): the task prompt asks for "exactly one line" per answer,
    but the checks here only test for non-empty content.

    Args:
        workspace: Root of the agent's workspace.
        ground_truth_path: Optional override for the ground-truth JSON;
            defaults to ``TASK_DIR / "ground_truth.json"``.

    Returns:
        A dict with ``task``, ``workspace``, ``checks``, ``outcome_score``
        and ``quality_rubric_meta``; ``quality`` is added only when the LLM
        call produced a score.

    Raises:
        OSError: If the ground-truth file cannot be read.
        json.JSONDecodeError: If the ground-truth file is not valid JSON.
    """
    w = workspace.resolve()
    gt_path = ground_truth_path or (TASK_DIR / "ground_truth.json")
    gt = json.loads(gt_path.read_text(encoding="utf-8"))
    files = list(gt.get("image_files") or ["target1.png", "target2.jpg"])
    refs = gt.get("rubric_reference") or {}

    # Read each answer file exactly once so the existence checks and the
    # quality prompt are guaranteed to see the same content.
    answer_rels = ("out/image1_answer.txt", "out/image2_answer.txt")
    answers: dict[str, str] = {}
    for rel in answer_rels:
        path = w / rel
        answers[rel] = (
            path.read_text(encoding="utf-8", errors="replace").strip()
            if path.is_file()
            else ""
        )

    checks = [
        {
            "id": rel.replace("/", "_"),
            "label": f"{rel} exists and is non-empty",
            "pass": bool(body),
            "weight": 0.5,
            "detail": None if body else "missing or empty",
        }
        for rel, body in answers.items()
    ]
    outcome_score = round(sum(c["weight"] for c in checks if c["pass"]), 4)

    ans1, ans2 = (answers[rel] for rel in answer_rels)
    img1 = files[0] if len(files) > 0 else "?"
    img2 = files[1] if len(files) > 1 else "?"
    user_text = QUALITY_USER_TMPL.format(
        img1_fn=img1,
        img2_fn=img2,
        ref1=refs.get("image1", ""),
        ref2=refs.get("image2", ""),
        ans1=ans1 or "(empty)",
        ans2=ans2 or "(empty)",
    )

    ql: float | None = None
    q_meta: dict[str, Any] = {}
    try:
        # Imported lazily so a missing grading package is reported through
        # q_meta like any other quality-LLM failure.
        from harnessbench.grading.oracle_quality_llm import run_oracle_quality_llm
        from harnessbench.grading.rubric_llm import build_workspace_image_attachment

        rel_paths = [f"image/{n}" for n in files if n]
        user = build_workspace_image_attachment(w, rel_paths, user_text)
        ql, q_meta = run_oracle_quality_llm(system=QUALITY_SYSTEM, user=user)
    except Exception as e:
        # Deliberate catch-all: the quality score is optional, so the
        # outcome score is still returned and the error is surfaced in meta.
        q_meta = {"skipped": False, "error": repr(e), "notes": "oracle quality LLM failed"}

    out: dict[str, Any] = {
        "task": "008-image-recognize",
        "workspace": str(w),
        "checks": checks,
        "outcome_score": outcome_score,
        "quality_rubric_meta": q_meta,
    }
    if ql is not None:
        out["quality"] = ql
    return out