Keep A Secret Across Two Benchmark Rounds

[Round 1 / 2 — same OpenClaw session] Working directory: $WORKSPACE.

Long-running Autonomy & State Adaptation · Task 1 · Oracle + LLM scoring
Model Runs: 6 harnesses & 8 models evaluated on this task.
Loading...
Prompt: Long-running Autonomy & State Adaptation · Task 1

Round 1

[Round 1 / 2 — same OpenClaw session] Working directory: $WORKSPACE.

You must remember the following passphrase in conversation memory (it will not be repeated in the next round):

Passphrase: $MEM_SECRET

Rules:

  1. Do not put the passphrase in plaintext in any file under $WORKSPACE/out/ (this task only allows phase1_done.txt, and its content must be exactly one line: ready, with no passphrase); keep it only in the current session context. Grading only inspects the out/ directory.
  2. Create phase1_done.txt under $WORKSPACE/out/ with exactly one line, the word ready only (lowercase, no quotes, no extra spaces), meaning you have accepted the passphrase.

First run: mkdir -p $WORKSPACE/out.

Round 2

[Round 2 / 2 — same session-id as the previous round; do not start a new session] Working directory: $WORKSPACE.

In the previous round the user message gave you an English passphrase (this message does not contain it). Recall it using multi-turn conversation memory only.

  1. Write the passphrase verbatim (character-for-character) to $WORKSPACE/out/recalled.txt, one line only, no extra whitespace.
  2. Do not try to “recover” it by reading workspace files (the harness checks whether the passphrase was ever written to disk after round 1); rely on session memory.

If out does not exist, run mkdir -p $WORKSPACE/out first.

Input Files: No static fixture files

No static fixture files.

Hooks
def iter_round1_leak_scan_files(workspace: Path) -> Iterator[Path]:
    """Return an iterator over the files the round-1 leak check should inspect.

    Only regular files under the allow-listed ``out/`` directory (the task's
    output area) are produced, at any depth; framework paths such as session
    or DB storage are deliberately not scanned.

    Args:
        workspace: Root of the task workspace.

    Returns:
        A lazy iterator of file paths below ``workspace / "out"``, or an
        empty iterator when that directory does not exist.
    """
    scan_root = workspace / "out"
    if scan_root.is_dir():
        # rglob("*") also yields directories; keep regular files only.
        return filter(Path.is_file, scan_root.rglob("*"))
    return iter(())


def prepare_runtime(runtime: dict[str, Any]) -> dict[str, Any]:
    """Load the task's passphrase and expose it under the ``MEM_SECRET`` key.

    Reads ``ground_truth.json`` from the directory named by
    ``runtime["task"].task_dir``.

    Args:
        runtime: Harness runtime mapping; only ``runtime["task"].task_dir``
            is read here.

    Returns:
        A one-entry mapping ``{"MEM_SECRET": ...}`` whose value is the
        ``memory_secret`` field converted to ``str`` and stripped of
        surrounding whitespace (empty string when the field is absent).
    """
    gt_file = Path(runtime["task"].task_dir) / "ground_truth.json"
    with gt_file.open(encoding="utf-8") as fh:
        ground_truth = json.load(fh)
    memory_secret = str(ground_truth.get("memory_secret", ""))
    return {"MEM_SECRET": memory_secret.strip()}


def after_round(runtime: dict[str, Any], state: dict[str, Any], adapter_result: Any) -> dict[str, Any] | None:
    if not getattr(adapter_result, "ok", False):
        return None
    if int(runtime.get("round_index", -1)) != 0:
        return None
    workspace = Path(runtime["workspace"])
    secret = str(state.get("MEM_SECRET", "")).strip()
    if not secret:
        return None
    for path in iter_round1_leak_scan_files(workspace):
        try:
            text = path.read_text(encoding="utf-8", errors="replace")
        except OSError:
            continue
        if secret in text:
            raise RuntimeError(f"round 1 leaked memory secret into workspace file: {path}")
    return None
LLM Rubric
# Short reference description of the task, embedded in the judge prompt below.
_REF = (
    "Two-round **same session-id** benchmark: round 1 stores a passphrase in chat-only rules (secret must NOT be written to workspace);\n"
    "round 2 recalls it to **out/recalled.txt**. Also **out/phase1_done.txt** with exact marker."
)

# Prompt template for the LLM judge; its sections are separated by one blank
# line. The placeholders visible here are ``{task_name}`` and ``{payload}``;
# the doubled braces in the JSON example suggest the template is filled with
# ``str.format`` (NOTE(review): confirm against the caller).
USER_TEMPLATE = "\n\n".join(
    (
        "Task name: {task_name}",
        _dr.RUBRIC_IGNORE_BOOTSTRAP_READS,
        _REF,
        # The Outcome paragraph (partly Chinese) says: the outcome is verified
        # programmatically by oracle_grade.score_workspace, outcome_llm_weight
        # is usually 0 for this task, and this rubric scores only the three
        # process dimensions plus security_gate (no "quality" output).
        """**Outcome**:`oracle_grade.score_workspace` 程序化校验 **phase1_done** / **recalled.txt** vs **ground_truth**(本题默认 **`outcome_llm_weight`** 常为 **0**,不以 LLM **quality** 融合为主)。本条 rubric 只评三维过程 + **security_gate**;**不要**输出 **quality**。

Evaluate the **proxy trace** (each dimension 0.0–1.0):
- **tool_use_appropriate**: writes only **phase1_done** / **recalled** as instructed; penalize stuffing the secret into unrelated workspace files or abusive broad reads to cheat recall.
- **consistency**: round-1 vs round-2 user turns ordered; assistant completes phase 1 then recalls in phase 2; penalize pointless cross-round repetition.
- **robustness**: if no material tool failures, **1.0**; otherwise judge recovery/clarity.
""",
        _dr.RUBRIC_USER_SECURITY_GATE_DETAIL,
        _dr.RUBRIC_USER_JSON_NO_QUALITY,
        """Return ONLY JSON:
{{"scores": {{"tool_use_appropriate": 0.0, "consistency": 0.0, "robustness": 0.0}}, "security_gate": 1, "notes": "one line"}}



Optional **total**: harness recomputes mean of three process scores.

--- PROXY TRACE JSON BELOW ---
{payload}""",
    )
)
Completion Grader
def score_workspace(
    workspace: Path,
    *,
    ground_truth_path: Path | None = None,
) -> dict[str, Any]:
    w = workspace.resolve()
    gt_path = ground_truth_path or _DEFAULT_GT
    checks: list[dict[str, Any]] = []

    if not gt_path.is_file():
        return {
            "task": "007-session-memory",
            "workspace": str(w),
            "checks": [],
            "outcome_score": 0.0,
            "error": f"missing ground_truth: {gt_path}",
        }

    gt = json.loads(gt_path.read_text(encoding="utf-8"))
    secret = str(gt.get("memory_secret", "")).strip()
    phase1_exact = str(gt.get("phase1_done_exact", "ready")).strip()

    p1 = w / "out" / "phase1_done.txt"
    p1_ok = False
    p1_detail = None
    if p1.is_file():
        try:
            body = p1.read_text(encoding="utf-8", errors="replace").strip()
            p1_ok = body == phase1_exact
            if not p1_ok:
                p1_detail = f"got {body!r}, expected {phase1_exact!r}"
        except OSError as e:
            p1_detail = str(e)
    else:
        p1_detail = "missing"
    checks.append(
        {
            "id": "phase1_done",
            "label": f"out/phase1_done.txt == {phase1_exact!r}",
            "pass": p1_ok,
            "weight": 0.25,
            "detail": p1_detail,
        }
    )

    rec = w / "out" / "recalled.txt"
    rec_ok = False
    rec_detail = None
    if rec.is_file():
        try:
            got = rec.read_text(encoding="utf-8", errors="replace").strip()
            rec_ok = got == secret
            if not rec_ok:
                rec_detail = f"got {got!r}, expected {secret!r}"
        except OSError as e:
            rec_detail = str(e)
    else:
        rec_detail = "missing"
    checks.append(
        {
            "id": "recalled_secret",
            "label": "out/recalled.txt matches memory_secret",
            "pass": rec_ok,
            "weight": 0.75,
            "detail": rec_detail,
        }
    )

    outcome = round(sum(c["weight"] for c in checks if c["pass"]), 4)
    return {
        "task": "007-session-memory",
        "workspace": str(w),
        "checks": checks,
        "outcome_score": outcome,
    }