Model Runs
6 harnesses & 8 models evaluated on this task.
Loading...
Prompt
Workspace, Tool Use & Multimodal Operations · Task 5
Working directory: $WORKSPACE.
Absolute paths to two images (keep them in the message so vision-capable models can load them):
Image 1: $IMAGE1_ABS_PATH
Image 2: $IMAGE2_ABS_PATH
Tasks:
- Inspect each image and answer from what you see (short English; you may join a few very short phrases on one line with commas—do not need to cover everything, but avoid single-word non-answers).
- Image 1: besides the geometric shape, include main color and background.
- Image 2: besides species/object, include main colors, and at least one of foreground or background cues you can see.
- Write the answer for image 1 to $WORKSPACE/out/image1_answer.txt: exactly one line.
- Write the answer for image 2 to $WORKSPACE/out/image2_answer.txt: exactly one line.
- If missing: mkdir -p $WORKSPACE/out.
Input Files
2 files
image/target1.png
image/target2.jpg
Hooks
def prepare_runtime(runtime: dict[str, Any]) -> dict[str, Any]:
    """Build the absolute-image-path variables for this task.

    Reads ``ground_truth.json`` from the task directory
    (``runtime["task"].task_dir``) and, for every entry of its
    ``image_files`` list, maps ``IMAGE<n>_ABS_PATH`` (1-based) to the
    resolved path of ``<workspace>/image/<name>``.

    Args:
        runtime: Mapping that provides ``"workspace"`` (a path) and
            ``"task"`` (an object exposing ``task_dir``).

    Returns:
        A dict of ``IMAGE<n>_ABS_PATH`` -> absolute path string; empty when
        the ground truth has no (or an empty) ``image_files`` entry.
    """
    image_root = Path(runtime["workspace"]) / "image"
    truth_file = Path(runtime["task"].task_dir) / "ground_truth.json"
    truth = json.loads(truth_file.read_text(encoding="utf-8"))
    names = truth.get("image_files") or []
    return {
        f"IMAGE{position}_ABS_PATH": str((image_root / name).resolve())
        for position, name in enumerate(names, start=1)
    }


# LLM Rubric
USER_TEMPLATE = """This task: Working directory: `$WORKSPACE`. Absolute paths to two images (keep them in the message so vision-capable models can load them): Image 1: `$IMAGE1_ABS_PATH` Image 2: `$IMAGE2_ABS_PATH` Tasks: 1. **Inspect each image** an...
Key outputs: out, out/image1_answer.txt, out/image2_answer.txt.
Evaluate the agent run:
- tool_use_appropriate: uses tools that match the task modality and files/APIs.
- consistency: follows the prompt requirements and writes the expected artifacts.
- robustness: handles missing directories, malformed inputs, retries, or local-server issues when relevant.
Return ONLY JSON with scores, security_gate, and notes.
"""Completion Grader
def score_workspace(workspace: Path, *, ground_truth_path: Path | None = None) -> dict[str, Any]:
    """Grade an agent workspace for the image-recognition task.

    Two things are evaluated:

    * Completion checks: ``out/image1_answer.txt`` and
      ``out/image2_answer.txt`` must each exist and contain non-whitespace
      text. Each check carries weight 0.5, so ``outcome_score`` is 0.0,
      0.5 or 1.0.
    * Quality: the answers, the reference descriptions from the ground
      truth and the task images are handed to the oracle quality LLM. Any
      exception raised while importing or calling it is recorded in
      ``quality_rubric_meta`` instead of being propagated.

    Args:
        workspace: Root of the agent's working directory.
        ground_truth_path: Optional override for the ground-truth JSON;
            defaults to ``TASK_DIR / "ground_truth.json"``.

    Returns:
        A dict with ``task``, ``workspace``, ``checks``, ``outcome_score``
        and ``quality_rubric_meta``; ``quality`` is added only when the
        oracle returned a score.

    Note:
        Errors while reading or parsing the ground-truth file, or while
        reading an answer file, are not caught here.
    """
    w = workspace.resolve()
    gt_path = ground_truth_path or (TASK_DIR / "ground_truth.json")
    gt = json.loads(gt_path.read_text(encoding="utf-8"))
    files = list(gt.get("image_files") or ["target1.png", "target2.jpg"])
    refs = gt.get("rubric_reference") or {}

    # Read every answer file exactly once so that the completion checks and
    # the quality prompt are guaranteed to look at the same text.
    answers: dict[str, str] = {}
    for rel in ("out/image1_answer.txt", "out/image2_answer.txt"):
        path = w / rel
        answers[rel] = (
            path.read_text(encoding="utf-8", errors="replace").strip()
            if path.is_file()
            else ""
        )

    checks = [
        {
            "id": rel.replace("/", "_"),
            "label": f"{rel} exists and is non-empty",
            "pass": bool(body),
            "weight": 0.5,
            "detail": None if body else "missing or empty",
        }
        for rel, body in answers.items()
    ]
    outcome_score = round(sum(c["weight"] for c in checks if c["pass"]), 4)

    ans1 = answers["out/image1_answer.txt"]
    ans2 = answers["out/image2_answer.txt"]
    # "?" stands in for an image name the ground truth does not list.
    img1 = files[0] if len(files) > 0 else "?"
    img2 = files[1] if len(files) > 1 else "?"
    user_text = QUALITY_USER_TMPL.format(
        img1_fn=img1,
        img2_fn=img2,
        ref1=refs.get("image1", ""),
        ref2=refs.get("image2", ""),
        ans1=ans1 or "(empty)",
        ans2=ans2 or "(empty)",
    )

    ql: float | None = None
    q_meta: dict[str, Any] = {}
    try:
        # Imported lazily so a missing/broken grading backend is reported in
        # the result instead of preventing the completion checks.
        from harnessbench.grading.oracle_quality_llm import run_oracle_quality_llm
        from harnessbench.grading.rubric_llm import build_workspace_image_attachment

        rel_paths = [f"image/{n}" for n in files if n]
        user = build_workspace_image_attachment(w, rel_paths, user_text)
        ql, q_meta = run_oracle_quality_llm(system=QUALITY_SYSTEM, user=user)
    except Exception as e:
        # Deliberate best-effort boundary: quality grading must never abort
        # the outcome score, so the failure is surfaced via the metadata.
        q_meta = {"skipped": False, "error": repr(e), "notes": "oracle quality LLM failed"}

    out: dict[str, Any] = {
        "task": "008-image-recognize",
        "workspace": str(w),
        "checks": checks,
        "outcome_score": outcome_score,
        "quality_rubric_meta": q_meta,
    }
    if ql is not None:
        out["quality"] = ql
    return out