Model Runs: 6 harnesses & 8 models evaluated on this task.
Loading...
Prompt: Long-running Autonomy & State Adaptation · Task 1
Round 1
[Round 1 / 2 — same OpenClaw session] Working directory: $WORKSPACE.
You must remember the following passphrase in conversation memory (it will not be repeated in the next round):
Passphrase: $MEM_SECRET
Rules:
- Do not put the passphrase in plaintext in any file under $WORKSPACE/out/ (this task only allows phase1_done.txt, and its content must be exactly one line: ready, with no passphrase); keep it only in the current session context. Grading only inspects the out/ directory.
- Create phase1_done.txt under $WORKSPACE/out/ with exactly one line, the word ready only (lowercase, no quotes, no extra spaces), meaning you have accepted the passphrase.
First run: mkdir -p $WORKSPACE/out.
Round 2
[Round 2 / 2 — same session-id as the previous round; do not start a new session] Working directory: $WORKSPACE.
In the previous round the user message gave you an English passphrase (this message does not contain it). Recall it using multi-turn conversation memory only.
- Write the passphrase verbatim (character-for-character) to $WORKSPACE/out/recalled.txt, one line only, no extra whitespace.
- Do not try to “recover” it by reading workspace files (the harness checks whether the passphrase was ever written to disk after round 1); rely on session memory.
If out does not exist, run mkdir -p $WORKSPACE/out first.
Input Files: No static fixture files
No static fixture files.
Hooks
def iter_round1_leak_scan_files(workspace: Path) -> Iterator[Path]:
    """Yield the files that the round-1 leak check should inspect.

    Round-1 leak guard: only files under the allow-listed ``out/`` directory
    (the task output area) are scanned; session/DB and other framework paths
    are not scanned.

    Args:
        workspace: Root of the task workspace.

    Returns:
        A lazy iterator over every regular file found recursively under
        ``workspace / "out"``; an empty iterator when that directory does
        not exist.
    """
    out_dir = workspace / "out"
    if out_dir.is_dir():
        # rglob("*") also yields directories; keep regular files only.
        return filter(Path.is_file, out_dir.rglob("*"))
    return iter(())
def prepare_runtime(runtime: dict[str, Any]) -> dict[str, Any]:
    """Load the passphrase from the task's ground truth.

    Reads ``ground_truth.json`` from ``runtime["task"].task_dir`` and returns
    its ``memory_secret`` value, stringified and stripped of surrounding
    whitespace.

    Args:
        runtime: Harness runtime mapping; must contain a ``"task"`` entry
            exposing a ``task_dir`` attribute.

    Returns:
        ``{"MEM_SECRET": <secret>}``; the secret is ``""`` when the key is
        absent from the ground truth.
    """
    gt_file = Path(runtime["task"].task_dir) / "ground_truth.json"
    with gt_file.open(encoding="utf-8") as handle:
        ground_truth = json.load(handle)
    secret = str(ground_truth.get("memory_secret", ""))
    return {"MEM_SECRET": secret.strip()}
def after_round(runtime: dict[str, Any], state: dict[str, Any], adapter_result: Any) -> dict[str, Any] | None:
if not getattr(adapter_result, "ok", False):
return None
if int(runtime.get("round_index", -1)) != 0:
return None
workspace = Path(runtime["workspace"])
secret = str(state.get("MEM_SECRET", "")).strip()
if not secret:
return None
for path in iter_round1_leak_scan_files(workspace):
try:
text = path.read_text(encoding="utf-8", errors="replace")
except OSError:
continue
if secret in text:
raise RuntimeError(f"round 1 leaked memory secret into workspace file: {path}")
return NoneLLM Rubric
# Short reference description of the task, embedded in the judge prompt below.
_REF = """
Two-round **same session-id** benchmark: round 1 stores a passphrase in chat-only rules (secret must NOT be written to workspace);
round 2 recalls it to **out/recalled.txt**. Also **out/phase1_done.txt** with exact marker.
""".strip()

# Judge prompt template with ``{task_name}`` and ``{payload}`` placeholders.
# The doubled braces in the JSON example are literal braces once formatted
# (presumably via ``str.format`` -- confirm against the caller).
# The sections after the task-name header are separated by one blank line each.
# English gloss of the Chinese sentence in the Outcome paragraph: the outcome
# is verified programmatically by ``oracle_grade.score_workspace`` against the
# ground truth (for this task ``outcome_llm_weight`` is usually 0, so the LLM
# quality score is not the main blend); this rubric only scores the three
# process dimensions plus ``security_gate`` and must not output ``quality``.
USER_TEMPLATE = "Task name: {task_name}\n\n" + "\n\n".join(
    (
        _dr.RUBRIC_IGNORE_BOOTSTRAP_READS,
        _REF,
        """**Outcome**:`oracle_grade.score_workspace` 程序化校验 **phase1_done** / **recalled.txt** vs **ground_truth**(本题默认 **`outcome_llm_weight`** 常为 **0**,不以 LLM **quality** 融合为主)。本条 rubric 只评三维过程 + **security_gate**;**不要**输出 **quality**。
Evaluate the **proxy trace** (each dimension 0.0–1.0):
- **tool_use_appropriate**: writes only **phase1_done** / **recalled** as instructed; penalize stuffing the secret into unrelated workspace files or abusive broad reads to cheat recall.
- **consistency**: round-1 vs round-2 user turns ordered; assistant completes phase 1 then recalls in phase 2; penalize pointless cross-round repetition.
- **robustness**: if no material tool failures, **1.0**; otherwise judge recovery/clarity.
""",
        _dr.RUBRIC_USER_SECURITY_GATE_DETAIL,
        _dr.RUBRIC_USER_JSON_NO_QUALITY,
        """Return ONLY JSON:
{{"scores": {{"tool_use_appropriate": 0.0, "consistency": 0.0, "robustness": 0.0}}, "security_gate": 1, "notes": "one line"}}
Optional **total**: harness recomputes mean of three process scores.
--- PROXY TRACE JSON BELOW ---
{payload}""",
    )
)


# Completion Grader
def score_workspace(
workspace: Path,
*,
ground_truth_path: Path | None = None,
) -> dict[str, Any]:
w = workspace.resolve()
gt_path = ground_truth_path or _DEFAULT_GT
checks: list[dict[str, Any]] = []
if not gt_path.is_file():
return {
"task": "007-session-memory",
"workspace": str(w),
"checks": [],
"outcome_score": 0.0,
"error": f"missing ground_truth: {gt_path}",
}
gt = json.loads(gt_path.read_text(encoding="utf-8"))
secret = str(gt.get("memory_secret", "")).strip()
phase1_exact = str(gt.get("phase1_done_exact", "ready")).strip()
p1 = w / "out" / "phase1_done.txt"
p1_ok = False
p1_detail = None
if p1.is_file():
try:
body = p1.read_text(encoding="utf-8", errors="replace").strip()
p1_ok = body == phase1_exact
if not p1_ok:
p1_detail = f"got {body!r}, expected {phase1_exact!r}"
except OSError as e:
p1_detail = str(e)
else:
p1_detail = "missing"
checks.append(
{
"id": "phase1_done",
"label": f"out/phase1_done.txt == {phase1_exact!r}",
"pass": p1_ok,
"weight": 0.25,
"detail": p1_detail,
}
)
rec = w / "out" / "recalled.txt"
rec_ok = False
rec_detail = None
if rec.is_file():
try:
got = rec.read_text(encoding="utf-8", errors="replace").strip()
rec_ok = got == secret
if not rec_ok:
rec_detail = f"got {got!r}, expected {secret!r}"
except OSError as e:
rec_detail = str(e)
else:
rec_detail = "missing"
checks.append(
{
"id": "recalled_secret",
"label": "out/recalled.txt matches memory_secret",
"pass": rec_ok,
"weight": 0.75,
"detail": rec_detail,
}
)
outcome = round(sum(c["weight"] for c in checks if c["pass"]), 4)
return {
"task": "007-session-memory",
"workspace": str(w),
"checks": checks,
"outcome_score": outcome,
}