Model Runs
6 harnesses & 8 models evaluated on this task.
Loading...
Prompt
Workspace, Tool Use & Multimodal Operations · Task 1
The working directory is $WORKSPACE.
Read in/input.txt, count the number of lines in the file, and write the result to out/linecount.txt.
Requirements:
- Write only a single integer (the line count).
- Do not modify in/input.txt.
Input Files
1 file
in/input.txt
LLM Rubric
# Prompt text for the LLM rubric that judges an agent run on the line-count
# task. It restates the task, lists the three rubric dimensions
# (tool_use_appropriate, consistency, robustness) and asks for a JSON-only
# reply. The text contains no format placeholders.
# NOTE(review): the prompt says the oracle checks the file content "exactly",
# while score_workspace strips surrounding whitespace before comparing --
# confirm which wording is intended.
USER_TEMPLATE = (
    "This task: read in/input.txt, count its lines, and write\n"
    "only the integer count to out/linecount.txt. Oracle checks the final file\n"
    "content exactly.\n"
    "Evaluate the agent run:\n"
    "- tool_use_appropriate: uses file-reading and file-writing operations\n"
    "appropriate for a simple workspace counting task.\n"
    "- consistency: read input -> count lines -> write the required output file.\n"
    "- robustness: handles missing directories or output creation cleanly.\n"
    "Return ONLY JSON with scores, security_gate, and notes.\n"
)
Completion Grader
def score_workspace(workspace: Path) -> dict[str, Any]:
    """Grade the line-count task by inspecting ``out/linecount.txt``.

    The run passes when the file's content, with surrounding whitespace
    stripped, is exactly the string ``"4"``.

    Args:
        workspace: Root directory of the agent's workspace.

    Returns:
        A report dict with the task id, the workspace path as a string, a
        single-entry ``checks`` list, and ``outcome_score`` (1.0 on pass,
        0.0 otherwise). On failure the check's ``detail`` shows the repr of
        the stripped value that was read; it is ``None`` on success.
    """
    expected = "4"
    target = workspace / "out" / "linecount.txt"

    # The workspace is produced by the agent under test, so the target may be
    # missing, a directory, unreadable, or removed mid-check. Attempt the read
    # and treat any OS-level failure as "no answer" so the grader reports a
    # failing score instead of raising.
    try:
        value = target.read_text(encoding="utf-8", errors="replace").strip()
    except OSError:
        value = ""

    ok = value == expected
    return {
        "task": "001-file",
        "workspace": str(workspace),
        "checks": [
            {
                "id": "linecount",
                "label": "out/linecount.txt == 4",
                "pass": ok,
                "weight": 1.0,
                "detail": None if ok else f"got {value!r}",
            }
        ],
        "outcome_score": 1.0 if ok else 0.0,
    }