Replan After a Late Event Constraint

Round 1 of 2: create the initial staffing plan.

Long-running Autonomy & State Adaptation · Task 5 · Oracle + LLM scoring

Task ID: 059-event-update-replan

Difficulty: Medium

Tags

Model Runs: 6 harnesses & 8 models evaluated on this task.

Prompt: Long-running Autonomy & State Adaptation · Task 5

Round 1

Round 1 of 2: create the initial staffing plan.

Read $WORKSPACE/in/base_event.json and $WORKSPACE/in/team_availability.json. Create $WORKSPACE/out/original_plan.json.

The plan must schedule four work blocks for the "Atlas Launch Webinar": setup, rehearsal, live_support, and teardown. Use only people who are available in the fixture. Include room, start, end, assigned_people, and assumptions.

Do not create $WORKSPACE/out/revised_plan.json or $WORKSPACE/out/diff.md yet. Do not modify input files.

Round 2

Round 2 of 2: revise the plan after a new constraint.

Read the existing $WORKSPACE/out/original_plan.json plus $WORKSPACE/in/update_notice.json. Create:

$WORKSPACE/out/revised_plan.json
$WORKSPACE/out/diff.md

The update notice adds these constraints:

live_support must move to Room B
Chen is unavailable after 13:00
rehearsal must end before 11:00
add a 15-minute accessibility check before live_support

Preserve useful unaffected setup/teardown decisions from the original plan. $WORKSPACE/out/diff.md must list added, removed, and changed items. Do not modify input files.

Input Files: 3 files

in/base_event.json

in/team_availability.json

in/update_notice.json

Hooks

def prepare_runtime(context: dict[str, Any]) -> dict[str, Any]:
    """Ensure the workspace's ``out`` directory exists before the task runs.

    Args:
        context: Harness context; only ``context["workspace"]`` is read.

    Returns:
        An empty dict as the initial runtime state.
    """
    out_dir = Path(context["workspace"]).joinpath("out")
    # Idempotent: missing parents are created and an existing directory is fine.
    out_dir.mkdir(parents=True, exist_ok=True)
    return dict()


def after_round(context: dict[str, Any], runtime_state: dict[str, Any], adapter_result: Any) -> dict[str, Any]:
    """Per-round hook; hands the runtime state back untouched.

    ``context`` and ``adapter_result`` are accepted but not used.
    """
    carried_state = runtime_state
    return carried_state


def cleanup_runtime(context: dict[str, Any], runtime_state: dict[str, Any]) -> None:
    """Teardown hook; intentionally does nothing and returns ``None``."""
    return None

LLM Rubric

# Prompt template for the LLM rubric judge. It contains the named placeholders
# {task_name} and {payload}; the doubled braces in the JSON example presumably
# render as literal braces once the template is formatted (confirm at the call
# site). Sections are separated by a blank line; the task-specific instruction
# keeps its own trailing blank line, so it is followed by extra newlines.
USER_TEMPLATE = "Task name: {task_name}\n\n" + "\n\n".join(
    (
        _dr.RUBRIC_IGNORE_BOOTSTRAP_READS,
        _dr.RUBRIC_USER_OUTCOME_NOTICE,
        """Evaluate whether the agent made an initial plan, then revised it in response to the late event notice with a clear diff and minimal disruption to unaffected choices.

""",
        _dr.RUBRIC_USER_SECURITY_GATE_DETAIL,
        _dr.RUBRIC_USER_JSON_NO_QUALITY,
        """Return ONLY JSON:
{{"scores": {{"tool_use_appropriate": 0.0, "consistency": 0.0, "robustness": 0.0}}, "security_gate": 1, "notes": "one line"}}

--- PROXY TRACE JSON BELOW ---
{payload}""",
    )
)

Completion Grader

def _load_json(path: Path) -> Any:
    """Read the UTF-8 file at *path* and return its decoded JSON content.

    Raises whatever opening the file or JSON decoding raises (for example
    ``FileNotFoundError`` or ``json.JSONDecodeError``).
    """
    with path.open("r", encoding="utf-8") as handle:
        return json.load(handle)


def _minutes(t: str) -> int:
    """Convert an ``"H:M"`` clock string into minutes since midnight.

    Raises ``ValueError`` when *t* does not split into exactly two
    colon-separated integer fields.
    """
    hours_text, minutes_text = t.split(":")
    whole_hours = int(hours_text)
    extra_minutes = int(minutes_text)
    return whole_hours * 60 + extra_minutes


def _as_plan(data: Any) -> dict[str, Any]:
    """Return *data* when it is a JSON object; otherwise an empty dict."""
    return data if isinstance(data, dict) else {}


def _index_blocks(plan: dict[str, Any]) -> dict[str, dict[str, Any]]:
    """Map block id -> block for each well-formed entry of ``plan["blocks"]``.

    Entries that are not dicts, or whose ``id`` is not a string, are ignored;
    a missing or non-list ``blocks`` value yields an empty mapping.
    """
    raw_blocks = plan.get("blocks")
    if not isinstance(raw_blocks, list):
        return {}
    return {
        b["id"]: b
        for b in raw_blocks
        if isinstance(b, dict) and isinstance(b.get("id"), str)
    }


def score_workspace(workspace: Path) -> dict[str, Any]:
    """Grade the agent's output files against the task's ground truth.

    Reads ``out/original_plan.json``, ``out/revised_plan.json`` and
    ``out/diff.md`` under *workspace* and records one weighted pass/fail check
    per requirement. The final score is the passed weight divided by the total
    weight of the checks that were recorded.

    Args:
        workspace: Root directory of the agent workspace.

    Returns:
        A dict with ``task``, ``workspace``, ``outcome_score`` (rounded to four
        decimals) and ``checks`` (list of check records).
    """
    w = workspace.resolve()
    gt = _load_json(_GT)
    checks: list[dict[str, Any]] = []

    def add(cid: str, label: str, ok: bool, weight: float, detail: Any = None) -> None:
        checks.append({"id": cid, "label": label, "pass": bool(ok), "weight": weight, "detail": detail})

    plans: dict[str, dict[str, Any]] = {"original": {}, "revised": {}}
    for name, weight in (("original", 0.10), ("revised", 0.10)):
        label = f"{name}_plan.json is valid JSON"
        try:
            data = _load_json(w / "out" / f"{name}_plan.json")
        except Exception as exc:
            add(f"{name}_parse", label, False, weight, str(exc))
        else:
            # Valid JSON need not be an object; treat any other top-level value
            # as an empty plan so later checks fail instead of crashing.
            plans[name] = _as_plan(data)
            add(f"{name}_parse", label, True, weight)
    original, revised = plans["original"], plans["revised"]

    blocks = _index_blocks(revised)
    add(
        "required_blocks",
        "revised plan contains all required blocks",
        all(b in blocks for b in gt["required_blocks"]),
        0.15,
        list(blocks),
    )

    live = blocks.get("live_support", {})
    add("room_constraint", "live support moved to Room B", live.get("room") == gt["live_support_room"], 0.15, live)

    people = live.get("assigned_people", [])
    # A missing block or an empty/non-list roster must not count as "Chen
    # removed"; names are compared ignoring case and surrounding whitespace.
    chen_ok = (
        isinstance(people, list)
        and bool(people)
        and not any(isinstance(p, str) and p.strip().casefold() == "chen" for p in people)
    )
    add("chen_removed", "Chen is not assigned after 13:00 live support", chen_ok, 0.15, people)

    rehearsal = blocks.get("rehearsal", {})
    try:
        reh_ok = _minutes(str(rehearsal.get("end"))) <= _minutes(gt["rehearsal_latest_end"])
    except Exception:
        # Missing or unparseable times count as a failed check.
        reh_ok = False
    add("rehearsal_time", "rehearsal ends before or at 11:00", reh_ok, 0.10, rehearsal)

    acc = blocks.get("accessibility_check", {})
    try:
        acc_start = _minutes(str(acc.get("start")))
        acc_end = _minutes(str(acc.get("end")))
        acc_ok = (
            acc_end == _minutes(str(live.get("start")))
            and acc_end - acc_start == gt["accessibility_duration_minutes"]
        )
    except Exception:
        acc_ok = False
    add("accessibility_check", "15-minute accessibility check immediately precedes live support", acc_ok, 0.15, acc)

    if original and revised:
        original_setup = _index_blocks(original).get("setup")
        revised_setup = blocks.get("setup")
        # Both plans must actually contain a staffed setup block; otherwise
        # two absent values would compare equal and pass vacuously.
        unchanged_ok = (
            original_setup is not None
            and revised_setup is not None
            and "assigned_people" in original_setup
            and revised_setup.get("assigned_people") == original_setup["assigned_people"]
        )
        add("preserve_setup", "setup assignment is preserved from original when unaffected", unchanged_ok, 0.05)

    diff_path = w / "out" / "diff.md"
    if diff_path.is_file():
        low = diff_path.read_text(encoding="utf-8", errors="replace").lower()
        has_sections = all(word in low for word in ("added", "changed", "removed"))
        has_constraints = all(token.lower() in low for token in gt["changed_constraints"])
        add("diff_content", "diff.md lists added, removed, changed items and new constraints", has_sections and has_constraints, 0.15)
    else:
        add("diff_content", "diff.md exists", False, 0.15, "missing")

    total_w = sum(c["weight"] for c in checks)
    passed_w = sum(c["weight"] for c in checks if c["pass"])
    score = round(passed_w / total_w, 4) if total_w else 0.0
    return {"task": "059-event-update-replan", "workspace": str(w), "outcome_score": score, "checks": checks}

← Task 4 | Task 6 →