Task 3 — Long-running Autonomy & State Adaptation

Model Runs: 6 harnesses & 8 models evaluated on this task.

Loading...

Prompt: Long-running Autonomy & State Adaptation · Task 3

Round 1

Round 1 of 2: start the interrupted case review.

Input files are under $WORKSPACE/in:

case_queue.json lists all review items in their required order.
interruption_notice.json tells you where the simulated interruption happens.

Create $WORKSPACE/out/state.json only. Do not create $WORKSPACE/out/final_result.json in this round.

Process exactly the first three queue items, C-101 through C-103. For each processed item, compute:

risk_score = impact * likelihood
action = "escalate" when risk_score >= 12, otherwise "monitor"

$WORKSPACE/out/state.json must be valid JSON with:

schema_version: 1
run_id: "resume-drill-74"
completed_ids: ["C-101", "C-102", "C-103"]
pending_ids for the remaining queue items in order
per_item_results for the three completed items
processing_log with one entry per completed item; each entry must include id, step: "round1", and source_file: "case_queue.json"
interrupted_after_id: "C-103"

Do not modify input files.

Round 2

Round 2 of 2: resume from the saved state.

Read the existing $WORKSPACE/out/state.json from round 1, $WORKSPACE/in/case_queue.json, and $WORKSPACE/in/operator_patch.json. Continue from the saved pending_ids; do not recompute or rewrite the already completed C-101 through C-103 work except to carry it forward.

Create or update:

$WORKSPACE/out/state.json
$WORKSPACE/out/final_result.json

$WORKSPACE/out/state.json must now include all five completed ids and an empty pending_ids list. Apply valid operator patches to pending items before scoring. Ignore duplicate patch_id values after the first application, and ignore patches for unknown ids. The processing_log in state.json must distinguish reused/skipped preexisting work, applied patches, ignored patches, and newly processed round2 work.

$WORKSPACE/out/final_result.json must be valid JSON with:

run_id: "resume-drill-74"
resumed_from_state: true
total_items: 5
completed_ids in queue order
escalations: ids whose risk_score >= 12
monitor_ids: ids whose risk_score < 12
aggregate_risk_score: sum of all item risk scores
resume_audit with skipped_preexisting_ids and newly_processed_ids
patch_audit with applied_patch_ids, ignored_patch_ids, and ignored_reasons

Also create $WORKSPACE/out/resume_audit.md summarizing preserved history, patch handling, and idempotent resume behavior.

Do not modify input files. Do not create network-dependent output.

Input Files: 3 files

in/case_queue.json

in/interruption_notice.json

in/operator_patch.json

Hooks

def prepare_runtime(context: dict[str, Any]) -> dict[str, Any]:
    """Make sure the workspace has an ``out`` directory before the run.

    Args:
        context: Harness context; only ``context["workspace"]`` is read.

    Returns:
        An empty dict.
    """
    out_dir = Path(context["workspace"]).joinpath("out")
    # Idempotent: an already existing directory (or parents) is not an error.
    out_dir.mkdir(parents=True, exist_ok=True)
    return {}


def after_round(context: dict[str, Any], runtime_state: dict[str, Any], adapter_result: Any) -> dict[str, Any]:
    """Hand the runtime state back untouched after a round.

    ``context`` and ``adapter_result`` are accepted but not used.

    Returns:
        The very same ``runtime_state`` object that was passed in.
    """
    carried_state = runtime_state
    return carried_state


def cleanup_runtime(context: dict[str, Any], runtime_state: dict[str, Any]) -> None:
    """Do nothing; both arguments are ignored."""
    return None

LLM Rubric

# User-message template for the LLM rubric judge.
# It carries two placeholders, {task_name} and {payload}; the doubled braces
# around the JSON example are escaped literals (presumably the template is
# rendered with str.format - confirm at the call site).
# Fragments are concatenated in order with no extra separator.
USER_TEMPLATE = "".join(
    (
        "Task name: {task_name}\n\n",
        _dr.RUBRIC_IGNORE_BOOTSTRAP_READS,
        "\n\n",
        _dr.RUBRIC_USER_OUTCOME_NOTICE,
        "\n\n",
        """Evaluate the agent run from the proxy trace JSON. Reward careful two-round state handling, reading saved state before resuming, avoiding unnecessary recomputation, clear auditability, and no fixture mutation.

""",
        "\n\n",
        _dr.RUBRIC_USER_SECURITY_GATE_DETAIL,
        "\n\n",
        _dr.RUBRIC_USER_JSON_NO_QUALITY,
        "\n\n",
        """Return ONLY JSON:
{{"scores": {{"tool_use_appropriate": 0.0, "consistency": 0.0, "robustness": 0.0}}, "security_gate": 1, "notes": "one line"}}

Optional **total**: harness recomputes mean of three process scores.

--- PROXY TRACE JSON BELOW ---
{payload}""",
    )
)

Completion Grader

def _load_json(path: Path) -> Any:
    """Read *path* as UTF-8 text and return the decoded JSON value."""
    with path.open(encoding="utf-8") as handle:
        return json.load(handle)


def _int_equals(value: Any, expected: Any) -> bool:
    """Return True when ``int(value)`` succeeds and equals *expected*.

    Agent-written JSON may hold ``None``, text, containers or ``Infinity``
    where a number is expected; those count as a mismatch instead of raising.
    """
    try:
        return int(value) == expected
    except (TypeError, ValueError, OverflowError):
        return False


def _as_dict(value: Any) -> dict[str, Any]:
    """Return *value* when it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def score_workspace(workspace: Path) -> dict[str, Any]:
    """Grade the files an agent left under ``<workspace>/out``.

    Reads ``out/state.json``, ``out/final_result.json`` and
    ``out/resume_audit.md`` and compares them against the ground truth loaded
    from the module-level ``_GT`` path. Every check is always recorded, so the
    weight total (and therefore the normalisation of ``outcome_score``) does
    not depend on which output files happen to exist. Malformed agent output
    (wrong JSON types, non-numeric scores, non-dict log entries) fails the
    affected check instead of raising.

    Args:
        workspace: Root directory of the agent workspace.

    Returns:
        A dict with ``task``, ``workspace`` (resolved path as text),
        ``outcome_score`` (passed weight / total weight, rounded to 4 places)
        and ``checks`` (one record per check with id, label, pass, weight,
        detail).

    Raises:
        Whatever ``_load_json(_GT)`` raises when the ground truth cannot be
        read, and ``KeyError`` when it lacks an expected key.
    """
    w = workspace.resolve()
    gt = _load_json(_GT)
    checks: list[dict[str, Any]] = []
    # Ids that round 2 is expected to process for the first time.
    newly_processed = ["C-104", "C-105"]

    def add(cid: str, label: str, ok: bool, weight: float, detail: Any = None) -> None:
        checks.append({"id": cid, "label": label, "pass": bool(ok), "weight": weight, "detail": detail})

    state: Any = None
    final: Any = None

    # Any read/parse failure is a failed check, never a grader crash.
    try:
        state = _load_json(w / "out" / "state.json")
    except Exception as exc:
        add("state_parse", "out/state.json is parseable", False, 0.10, str(exc))
    else:
        add("state_parse", "out/state.json is parseable", True, 0.10)

    try:
        final = _load_json(w / "out" / "final_result.json")
    except Exception as exc:
        add("final_parse", "out/final_result.json is parseable", False, 0.10, str(exc))
    else:
        add("final_parse", "out/final_result.json is parseable", True, 0.10)

    # --- state.json checks -------------------------------------------------
    # A missing, empty or non-object state fails all dependent checks; they
    # are still recorded so they keep counting in the denominator.
    state_ok = isinstance(state, dict) and bool(state)
    state_obj: dict[str, Any] = state if state_ok else {}
    state_note = None if state_ok else "out/state.json is missing, empty, or not a JSON object"

    add(
        "state_complete",
        "state records final completion and no pending ids",
        state_ok and state_obj.get("completed_ids") == gt["completed_final"] and state_obj.get("pending_ids") == [],
        0.15,
        state_note,
    )

    results = state_obj.get("per_item_results", {})
    score_ok = (
        state_ok
        and isinstance(results, dict)
        and all(
            _int_equals(_as_dict(results.get(item_id)).get("risk_score", -1), expected)
            for item_id, expected in gt["risk_scores"].items()
        )
    )
    add("state_scores", "state keeps correct per-item risk scores", score_ok, 0.15, state_note)

    raw_log = state_obj.get("processing_log", [])
    # Only dict entries can carry id/step/status; anything else is ignored.
    log = [entry for entry in raw_log if isinstance(entry, dict)] if isinstance(raw_log, list) else []

    # Tuple/list membership compares by equality, so an unhashable value
    # (e.g. a list used as "step" or "id") cannot raise here.
    round2_ids = [e.get("id") for e in log if e.get("step") == "round2"]
    round2_pending_ok = all(item_id in round2_ids for item_id in newly_processed)
    redo_ids = [
        e.get("id")
        for e in log
        if e.get("step") in ("round2", "reprocessed") and e.get("id") in gt["completed_after_round1"]
    ]
    # NOTE(review): an entry counts as "skipped/reused" only when BOTH its
    # step and its status match; a bare step of "skipped_preexisting" with no
    # status does not qualify. Confirm that this is the intended contract.
    skip_steps = ("skipped_preexisting", "skip_preexisting", "round1")
    skip_statuses = ("skipped_preexisting", "skip_preexisting", "reused_preexisting", "reused")
    skip_ids = [
        e.get("id")
        for e in log
        if e.get("step") in skip_steps and str(e.get("status", "")).lower() in skip_statuses
    ]
    add(
        "resume_log",
        "log shows pending work processed in round 2 without reprocessing completed items",
        state_ok and round2_pending_ok and not redo_ids,
        0.15,
        {"round2_ids": round2_ids, "redo_ids": redo_ids},
    )
    skip_ids_present = state_ok and all(item_id in skip_ids for item_id in gt["completed_after_round1"])
    add("skip_audit", "preexisting round 1 items are explicitly skipped or reused", skip_ids_present, 0.10, skip_ids)

    # --- final_result.json checks -----------------------------------------
    final_ok = isinstance(final, dict) and bool(final)
    final_obj: dict[str, Any] = final if final_ok else {}
    final_note = None if final_ok else "out/final_result.json is missing, empty, or not a JSON object"

    content_ok = (
        final_ok
        and final_obj.get("run_id") == "resume-drill-74"
        and final_obj.get("resumed_from_state") is True
        and final_obj.get("completed_ids") == gt["completed_final"]
        and final_obj.get("escalations") == gt["escalations"]
        and final_obj.get("monitor_ids") == gt["monitor_ids"]
        and _int_equals(final_obj.get("aggregate_risk_score", -1), gt["aggregate_risk_score"])
    )
    add(
        "final_content",
        "final result has correct resume summary and aggregates",
        content_ok,
        0.20,
        final_obj if final_ok else final_note,
    )

    audit = final_obj.get("resume_audit", {})
    audit_map = _as_dict(audit)
    add(
        "final_audit",
        "final audit separates skipped and newly processed ids",
        final_ok
        and audit_map.get("skipped_preexisting_ids") == gt["completed_after_round1"]
        and audit_map.get("newly_processed_ids") == newly_processed,
        0.10,
        audit if final_ok else final_note,
    )

    patch_audit = final_obj.get("patch_audit", {})
    patch_map = _as_dict(patch_audit)
    add(
        "patch_audit",
        "final audit separates applied and ignored patches",
        final_ok
        and patch_map.get("applied_patch_ids") == gt["applied_patch_ids"]
        and patch_map.get("ignored_patch_ids") == gt["ignored_patch_ids"],
        0.10,
        patch_audit if final_ok else final_note,
    )

    # --- resume_audit.md: case-insensitive keyword presence ----------------
    audit_path = w / "out" / "resume_audit.md"
    audit_text = audit_path.read_text(encoding="utf-8", errors="replace").lower() if audit_path.is_file() else ""
    add(
        "resume_audit_md",
        "resume_audit.md explains history preservation and patch handling",
        all(term.lower() in audit_text for term in gt["audit_terms"]),
        0.10,
    )

    # --- fixture: loaded once; an unreadable fixture fails the check -------
    try:
        fixture = _load_json(_FIXTURE)
    except Exception as exc:
        add("fixture_present", "source fixture remains readable", False, 0.05, str(exc))
    else:
        # Same condition as comparing against {"run_id": ..., "items": fixture["items"]}:
        # exactly those two keys, with the expected run id.
        fixture_ok = (
            isinstance(fixture, dict)
            and set(fixture) == {"run_id", "items"}
            and fixture["run_id"] == "resume-drill-74"
        )
        add("fixture_present", "source fixture remains readable", fixture_ok, 0.05)

    total_w = sum(c["weight"] for c in checks)
    total = round(sum(c["weight"] for c in checks if c["pass"]) / total_w, 4) if total_w else 0.0
    return {"task": "057-interruption-resume", "workspace": str(w), "outcome_score": total, "checks": checks}

Interrupt and Resume a Partially Completed Case Review