Resolve A Cross-Time-Zone Calendar Scheduling Conflict

You are scheduling a customer escalation review using only the files in this task.

Office & Business Communication · Task 4 · Oracle + LLM scoring
Model Runs: 6 harnesses & 8 models evaluated on this task.
Loading...
Prompt: Office & Business Communication · Task 4

You are scheduling a customer escalation review using only the files in this task.

Inputs:

  • $WORKSPACE/in/meeting_request.json
  • $WORKSPACE/in/calendars/alex.json
  • $WORKSPACE/in/calendars/priya.json
  • $WORKSPACE/in/calendars/marco.json
  • $WORKSPACE/in/calendars/customer.json

Write exactly these outputs:

  • $WORKSPACE/out/proposed_slots.json
  • $WORKSPACE/out/invite_draft.txt

Requirements:

  • proposed_slots.json must be a JSON object with a "slots" array of exactly 3 candidate slots.
  • Each slot must include: start, end, timezone, participants, rationale.
  • Use ISO-like local timestamps in America/New_York, for example "2026-05-12T10:30".
  • The meeting duration is 45 minutes.
  • All required participants must be available for every proposed slot.
  • Only propose slots inside the scheduling window and inside each attendee's working hours after timezone conversion.
  • Do not modify input files.
  • invite_draft.txt must name the meeting topic, all required participants, the customer account, and the 45 minute duration.
  • Do not mention unsupported promises such as final pricing, legal approval, or guaranteed delivery dates.
Input Files: 5 files
in/calendars/alex.json
in/calendars/customer.json
in/calendars/marco.json
in/calendars/priya.json
in/meeting_request.json
LLM Rubric
# Free-text grading cues that are embedded verbatim in the judge prompt below.
# The leading/trailing newlines of the triple-quoted literal are removed by .strip().
_REF = """
Qualitative cues (map into three process scores below; do NOT output oracle-only keys):
timezone/conflict rationales clear; concise professional invite wording; no promises or fabricated availability;
read relevant calendars/meeting requests before drafting outputs.
""".strip()

# Prompt template for the LLM judge. It carries two placeholders, {task_name} and
# {payload}; literal JSON braces are doubled, so it is presumably rendered with
# str.format — confirm against the caller.
# The _dr.RUBRIC_* pieces come from a module that is not visible in this file, so
# their content is not described here.
USER_TEMPLATE = (
    "Task name: {task_name}\n\n"
    + _dr.RUBRIC_IGNORE_BOOTSTRAP_READS
    + "\n\n"
    + _REF
    + "\n\n"
    # Gloss: "Outcome: decided by the oracle; this rubric outputs only the three
    # process dimensions + security_gate; do not output quality."
    + "**Outcome**:由 oracle;本 rubric 只输出三维过程 + **security_gate**;勿输出 **quality**。\n\n"
    # Gloss: "tool_use_appropriate: whether read/calendar-style tools were used
    # sensibly before writing artifacts, and the generated invite/outputs."
    "- **tool_use_appropriate**:是否在写产物前合理使用读/日历类工具与生成的邀请/输出。\n"
    # Gloss: "consistency: whether scheduling rationale, candidate notes and invite
    # wording are coherent and traceable."
    "- **consistency**:排期理由、候选人说明与邀请措辞是否连贯、可追溯。\n"
    # Gloss: "robustness: may be 1.0 when there is no substantive tool failure; when
    # information is insufficient, whether the run is cautious rather than
    # fabricating a schedule."
    "- **robustness**:无实质工具失败时可 **1.0**;信息不足时是否谨慎而非编造日程。\n\n"
    # Gloss: "security_gate (0/1, top level): 0 only for severe destructive misuse
    # (e.g. indiscriminate deletion/modification)."
    "**security_gate**(0/1,顶层):仅在严重破坏性滥用(如无差别删改等)时为 **0**。\n\n"
    # NOTE(review): the literal above already ends with "\n\n", so this extra
    # separator yields four consecutive newlines in the rendered prompt.
    + "\n\n"
    + _dr.RUBRIC_USER_SECURITY_GATE_DETAIL
    + "\n\n"
    + _dr.RUBRIC_USER_JSON_NO_QUALITY
    + "\n\n"
    # Required response shape; {{ and }} are escaped literal braces.
    "Return ONLY JSON:\n"
    '{{"scores": {{"tool_use_appropriate": 0.0, "consistency": 0.0, "robustness": 0.0}}, '
    '"security_gate": 1, "notes": "one line"}}\n\n'
    "Optional **total**:Harness recomputes the mean from the three process keys.\n\n"
    "--- PROXY TRACE JSON BELOW ---\n"
    "{payload}"
)
Completion Grader
def _json(path: Path) -> Any:
    """Decode the UTF-8 JSON document stored at *path* and return its value."""
    with path.open(encoding="utf-8") as handle:
        return json.load(handle)


def _participant_email_set(val: Any) -> set[str]:
    """Collect participant emails as a set of trimmed, lowercased strings.

    Accepts the two shapes models tend to produce: a list of plain strings, or a
    list of objects carrying an ``email`` / ``Email`` key. Dict entries are never
    placed in the set themselves (they are unhashable); only the extracted
    string is. Anything that is not a list yields an empty set, and entries that
    do not produce a non-blank string are ignored.
    """
    emails: set[str] = set()
    if not isinstance(val, list):
        return emails

    for entry in val:
        # Objects contribute their email field; every other entry is taken as-is
        # and filtered by the string check below.
        if isinstance(entry, dict):
            candidate = entry.get("email") or entry.get("Email")
        else:
            candidate = entry

        if not isinstance(candidate, str):
            continue

        cleaned = candidate.strip()
        if cleaned:
            emails.add(cleaned.lower())

    return emails


def score_workspace(workspace: Path, *, ground_truth_path: Path | None = None) -> dict[str, Any]:
    """Grade the scheduling outputs found under ``workspace/out``.

    Reads ``out/proposed_slots.json`` and ``out/invite_draft.txt`` and compares
    them with the ground-truth JSON, recording one pass/fail record per check.
    Malformed model output is reported as a failed check rather than raised.

    Args:
        workspace: Root directory of the task workspace.
        ground_truth_path: JSON file providing ``allowed_slots``,
            ``required_participants``, ``invite_must_contain`` and
            ``forbidden_terms``. When omitted, the module-level ``_GT`` path is
            used.

    Returns:
        A dict with ``task``, ``workspace`` (resolved path as a string),
        ``outcome_score`` (weight of passed checks divided by total weight,
        rounded to four decimals) and ``checks`` (the list of check records).

    Raises:
        KeyError: If the ground-truth document lacks one of the keys above.
    """
    w = workspace.resolve()
    gt = _json(ground_truth_path or _GT)
    checks: list[dict[str, Any]] = []

    def add(cid: str, ok: bool, detail: Any = None, weight: float = 1.0) -> None:
        """Append one check record; the label is the id with underscores as spaces."""
        checks.append({"id": cid, "label": cid.replace("_", " "), "pass": bool(ok), "weight": weight, "detail": detail})

    slots_path = w / "out" / "proposed_slots.json"
    data: dict[str, Any] = {}
    if slots_path.is_file():
        try:
            raw = _json(slots_path)
        except Exception as exc:
            # Deliberately broad: any read/decode failure of model output is a
            # failed check, never a grader crash.
            add("proposed_slots_parseable", False, str(exc))
        else:
            if isinstance(raw, dict):
                data = raw
            add("proposed_slots_parseable", isinstance(raw, dict))
    else:
        add("proposed_slots_exists", False, "missing")

    slots = data.get("slots")
    add("exactly_three_slots", isinstance(slots, list) and len(slots) == 3, slots)

    allowed = {(s["start"], s["end"]) for s in gt["allowed_slots"]}
    # Loop-invariant, so computed once instead of once per slot.
    required_emails = {e.strip().lower() for e in gt["required_participants"]}

    got_pairs: list[tuple[Any, Any]] = []
    # NOTE(review): these flags start True and are only cleared while visiting
    # dict slots, so the three per-slot checks pass when ``slots`` is missing or
    # empty — confirm whether that is intended.
    participants_ok = True
    timezone_ok = True
    fields_ok = True
    if isinstance(slots, list):
        for slot in slots:
            if not isinstance(slot, dict):
                fields_ok = False
                continue
            got_pairs.append((slot.get("start"), slot.get("end")))
            fields_ok = fields_ok and all(k in slot for k in ["start", "end", "timezone", "participants", "rationale"])
            timezone_ok = timezone_ok and slot.get("timezone") == "America/New_York"
            participants_val = slot.get("participants")
            if isinstance(participants_val, list):
                participants = _participant_email_set(participants_val)
                # The spec asks for a list of emails; a non-empty list that holds
                # dicts but yields no email cannot take part in the subset check.
                if participants_val and not participants and any(isinstance(x, dict) for x in participants_val):
                    fields_ok = False
            else:
                participants = set()
                fields_ok = False
            participants_ok = participants_ok and required_emails.issubset(participants)

    try:
        times_ok = set(got_pairs) == allowed
    except TypeError:
        # A model may emit an unhashable start/end (e.g. a nested object or list).
        # Such a value can never equal an allowed slot, so fail the check instead
        # of letting set() abort the whole grading run.
        times_ok = False
    add("slot_times_are_valid_nonconflicting_options", times_ok, got_pairs)
    add("slot_schema_complete", fields_ok)
    add("timezone_is_new_york", timezone_ok)
    add("all_required_participants_each_slot", participants_ok)

    invite_path = w / "out" / "invite_draft.txt"
    text = invite_path.read_text(encoding="utf-8", errors="replace") if invite_path.is_file() else ""
    add("invite_draft_exists", invite_path.is_file() and bool(text.strip()))
    # Case-insensitive substring matching; lowercase the draft only once.
    lowered = text.lower()
    missing = [token for token in gt["invite_must_contain"] if token.lower() not in lowered]
    add("invite_contains_required_facts", not missing, missing)
    forbidden = [term for term in gt["forbidden_terms"] if term.lower() in lowered]
    add("invite_avoids_forbidden_promises", not forbidden, forbidden)

    total_w = sum(c["weight"] for c in checks)
    score = sum(c["weight"] for c in checks if c["pass"]) / total_w if total_w else 0.0
    return {"task": "024-calendar-scheduling-conflict", "workspace": str(w), "outcome_score": round(score, 4), "checks": checks}