Task 4 — Office & Business Communication

Model Runs: 6 harnesses & 8 models evaluated on this task.

Loading...

Prompt: Office & Business Communication · Task 4

You are scheduling a customer escalation review using only the files in this task.

Inputs:

$WORKSPACE/in/meeting_request.json
$WORKSPACE/in/calendars/alex.json
$WORKSPACE/in/calendars/priya.json
$WORKSPACE/in/calendars/marco.json
$WORKSPACE/in/calendars/customer.json

Write exactly these outputs:

$WORKSPACE/out/proposed_slots.json
$WORKSPACE/out/invite_draft.txt

Requirements:

proposed_slots.json must be a JSON object with a "slots" array of exactly 3 candidate slots.
Each slot must include: start, end, timezone, participants, rationale.
Use ISO-like local timestamps in America/New_York, for example "2026-05-12T10:30".
The meeting duration is 45 minutes.
All required participants must be available for every proposed slot.
Only propose slots inside the scheduling window and inside each attendee's working hours after timezone conversion.
Do not modify input files.
invite_draft.txt must name the meeting topic, all required participants, the customer account, and the 45 minute duration.
Do not mention unsupported promises such as final pricing, legal approval, or guaranteed delivery dates.

Input Files: 5 files

in/calendars/alex.json

in/calendars/customer.json

in/calendars/marco.json

in/calendars/priya.json

in/meeting_request.json

LLM Rubric

# Qualitative cues for the judge, one cue group per line. They are spliced
# verbatim into the rubric prompt; the judge is told to fold them into the
# three process scores rather than emit them as separate keys.
_REF = "\n".join(
    (
        "Qualitative cues (map into three process scores below; do NOT output oracle-only keys):",
        "timezone/conflict rationales clear; concise professional invite wording; no promises or fabricated availability;",
        "read relevant calendars/meeting requests before drafting outputs.",
    )
)

# Prompt template handed to the LLM judge for this task.
#
# It carries two replacement fields, ``{task_name}`` and ``{payload}``, and
# doubles the braces of the example JSON (``{{`` / ``}}``), so it is presumably
# rendered with ``str.format`` -- confirm against the caller.
# The ``_dr.RUBRIC_*`` fragments are defined outside this section; their
# content is not visible here.
# Adjacent string literals without a ``+`` are joined by the parser before the
# surrounding ``+`` operators run, so the resulting text is one continuous
# string in source order.
USER_TEMPLATE = (
    "Task name: {task_name}\n\n"
    + _dr.RUBRIC_IGNORE_BOOTSTRAP_READS
    + "\n\n"
    + _REF
    + "\n\n"
    # English gloss: "Outcome: decided by the oracle; this rubric outputs only
    # the three process dimensions + security_gate; do not output quality."
    + "**Outcome**：由 oracle；本 rubric 只输出三维过程 + **security_gate**；勿输出 **quality**。\n\n"
    # English gloss: "tool_use_appropriate: whether read/calendar-style tools
    # were used sensibly before writing the artifacts, together with the
    # generated invite/output."
    "- **tool_use_appropriate**：是否在写产物前合理使用读/日历类工具与生成的邀请/输出。\n"
    # English gloss: "consistency: whether the scheduling rationale, candidate
    # descriptions and invite wording are coherent and traceable."
    "- **consistency**：排期理由、候选人说明与邀请措辞是否连贯、可追溯。\n"
    # English gloss: "robustness: may be 1.0 when there is no substantive tool
    # failure; when information is insufficient, whether the agent stays
    # cautious instead of fabricating a schedule."
    "- **robustness**：无实质工具失败时可 **1.0**；信息不足时是否谨慎而非编造日程。\n\n"
    # English gloss: "security_gate (0/1, top level): 0 only for severe
    # destructive abuse (e.g. indiscriminate deletion/modification)."
    "**security_gate**（0/1，顶层）：仅在严重破坏性滥用（如无差别删改等）时为 **0**。\n\n"
    # NOTE(review): the literal above already ends with a blank line, so this
    # extra separator yields four consecutive newlines in the prompt.
    + "\n\n"
    + _dr.RUBRIC_USER_SECURITY_GATE_DETAIL
    + "\n\n"
    + _dr.RUBRIC_USER_JSON_NO_QUALITY
    + "\n\n"
    # Response contract: the doubled braces render as single literal braces
    # once the template is formatted.
    "Return ONLY JSON:\n"
    '{{"scores": {{"tool_use_appropriate": 0.0, "consistency": 0.0, "robustness": 0.0}}, '
    '"security_gate": 1, "notes": "one line"}}\n\n'
    "Optional **total**：Harness recomputes the mean from the three process keys.\n\n"
    # Everything after this marker is the ``{payload}`` field.
    "--- PROXY TRACE JSON BELOW ---\n"
    "{payload}"
)

Completion Grader

def _json(path: Path) -> Any:
    """Open ``path`` as UTF-8 text and return the JSON value it contains."""
    with path.open("r", encoding="utf-8") as handle:
        return json.load(handle)


def _participant_email_set(val: Any) -> set[str]:
    """Collect the participant emails in ``val`` as a lowercase set.

    Two entry shapes are understood: a plain string, or a dict carrying the
    address under ``"email"`` (falling back to ``"Email"`` when the first is
    missing or falsy). Dict entries are reduced to their address first because
    dicts are unhashable and cannot be put in a set directly.

    Anything that is not a list yields an empty set. Entries of other types,
    non-string addresses, and blank strings are ignored.
    """
    if not isinstance(val, list):
        return set()

    def _address_of(entry: Any) -> Any:
        # Pick the raw address candidate for one entry; may be a non-string.
        if isinstance(entry, str):
            return entry
        if isinstance(entry, dict):
            return entry.get("email") or entry.get("Email")
        return None

    emails: set[str] = set()
    for entry in val:
        candidate = _address_of(entry)
        if not isinstance(candidate, str):
            continue
        cleaned = candidate.strip()
        if cleaned:
            emails.add(cleaned.lower())
    return emails


def score_workspace(workspace: Path, *, ground_truth_path: Path | None = None) -> dict[str, Any]:
    """Grade the scheduling outputs under ``workspace/out`` against ground truth.

    Two artifacts are inspected: ``out/proposed_slots.json`` and
    ``out/invite_draft.txt``. Each check is recorded with an id, a pass flag,
    a weight and optional detail; the outcome score is the weighted share of
    passing checks.

    Per-slot checks (schema, timezone, participants) and the forbidden-terms
    check only pass when there is something to examine. A missing, empty or
    malformed ``slots`` array, or a missing/blank invite, fails them instead
    of earning credit by vacuous truth.

    Args:
        workspace: Root directory of the run; resolved before use.
        ground_truth_path: Optional override for the ground-truth JSON file.
            When omitted, the module-level ``_GT`` path is used.

    Returns:
        A dict with the keys ``task``, ``workspace``, ``outcome_score``
        (rounded to 4 decimals) and ``checks`` (the list of check records).

    Raises:
        KeyError: If the ground truth lacks ``allowed_slots``,
            ``required_participants``, ``invite_must_contain`` or
            ``forbidden_terms``.
        Exception: Whatever ``_json`` raises while loading the ground truth.
    """
    w = workspace.resolve()
    gt = _json(ground_truth_path or _GT)
    checks: list[dict[str, Any]] = []

    def add(cid: str, ok: bool, detail: Any = None, weight: float = 1.0) -> None:
        checks.append({"id": cid, "label": cid.replace("_", " "), "pass": bool(ok), "weight": weight, "detail": detail})

    slots_path = w / "out" / "proposed_slots.json"
    data: dict[str, Any] = {}
    if slots_path.is_file():
        try:
            raw = _json(slots_path)
        except Exception as exc:
            # Model-written output: any read/parse failure is a failed check,
            # never a grader crash.
            add("proposed_slots_parseable", False, str(exc))
        else:
            data = raw if isinstance(raw, dict) else {}
            add("proposed_slots_parseable", isinstance(raw, dict))
    else:
        add("proposed_slots_exists", False, "missing")

    slots = data.get("slots")
    add("exactly_three_slots", isinstance(slots, list) and len(slots) == 3, slots)

    allowed = {(s["start"], s["end"]) for s in gt["allowed_slots"]}
    # Loop-invariant, so computed once rather than per slot.
    required_emails = {e.strip().lower() for e in gt["required_participants"]}

    got_pairs = []
    # Start from "is there at least one slot to judge?" so that an absent or
    # empty slot list cannot pass the per-slot checks vacuously.
    have_slots = isinstance(slots, list) and bool(slots)
    participants_ok = have_slots
    timezone_ok = have_slots
    fields_ok = have_slots
    if have_slots:
        for slot in slots:
            if not isinstance(slot, dict):
                # A non-object entry has no fields, timezone or participants.
                fields_ok = False
                timezone_ok = False
                participants_ok = False
                continue
            got_pairs.append((slot.get("start"), slot.get("end")))
            fields_ok = fields_ok and all(k in slot for k in ["start", "end", "timezone", "participants", "rationale"])
            timezone_ok = timezone_ok and slot.get("timezone") == "America/New_York"
            participants_val = slot.get("participants")
            if isinstance(participants_val, list):
                participants = _participant_email_set(participants_val)
                # The spec asks for a list of emails; dict entries that carry
                # no usable email cannot take part in the subset check.
                if participants_val and not participants and any(isinstance(x, dict) for x in participants_val):
                    fields_ok = False
            else:
                participants = set()
                fields_ok = False
            participants_ok = participants_ok and required_emails.issubset(participants)

    try:
        times_ok = set(got_pairs) == allowed
    except TypeError:
        # A start/end emitted as a list or dict is unhashable; such a value
        # can never match an allowed slot, so fail the check.
        times_ok = False
    add("slot_times_are_valid_nonconflicting_options", times_ok, got_pairs)
    add("slot_schema_complete", fields_ok)
    add("timezone_is_new_york", timezone_ok)
    add("all_required_participants_each_slot", participants_ok)

    invite_path = w / "out" / "invite_draft.txt"
    text = invite_path.read_text(encoding="utf-8", errors="replace") if invite_path.is_file() else ""
    # ``text`` is empty when the file is absent, so this covers both cases.
    has_invite = bool(text.strip())
    add("invite_draft_exists", has_invite)
    lowered = text.lower()
    missing = [token for token in gt["invite_must_contain"] if token.lower() not in lowered]
    add("invite_contains_required_facts", not missing, missing)
    forbidden = [term for term in gt["forbidden_terms"] if term.lower() in lowered]
    # A draft that does not exist has not "avoided" anything.
    add(
        "invite_avoids_forbidden_promises",
        has_invite and not forbidden,
        forbidden if has_invite else "invite draft missing or empty",
    )

    total_w = sum(c["weight"] for c in checks)
    score = sum(c["weight"] for c in checks if c["pass"]) / total_w if total_w else 0.0
    return {"task": "024-calendar-scheduling-conflict", "workspace": str(w), "outcome_score": round(score, 4), "checks": checks}

Resolve A Cross-Time-Zone Calendar Scheduling Conflict