Model Runs: 6 harnesses & 8 models evaluated on this task.
Loading...
Prompt: Office & Business Communication · Task 4
You are scheduling a customer escalation review using only the files in this task.
Inputs:
- $WORKSPACE/in/meeting_request.json
- $WORKSPACE/in/calendars/alex.json
- $WORKSPACE/in/calendars/priya.json
- $WORKSPACE/in/calendars/marco.json
- $WORKSPACE/in/calendars/customer.json
Write exactly these outputs:
- $WORKSPACE/out/proposed_slots.json
- $WORKSPACE/out/invite_draft.txt
Requirements:
- proposed_slots.json must be a JSON object with a "slots" array of exactly 3 candidate slots.
- Each slot must include: start, end, timezone, participants, rationale.
- Use ISO-like local timestamps in America/New_York, for example "2026-05-12T10:30".
- The meeting duration is 45 minutes.
- All required participants must be available for every proposed slot.
- Only propose slots inside the scheduling window and inside each attendee's working hours after timezone conversion.
- Do not modify input files.
- invite_draft.txt must name the meeting topic, all required participants, the customer account, and the 45 minute duration.
- Do not mention unsupported promises such as final pricing, legal approval, or guaranteed delivery dates.
Input Files: 5 files
in/calendars/alex.json
in/calendars/customer.json
in/calendars/marco.json
in/calendars/priya.json
in/meeting_request.json
LLM Rubric
# Qualitative reference cues that get spliced into the rubric prompt below.
# Built line by line so the final string carries no leading or trailing
# whitespace and the three cue lines are separated by single newlines.
_REF = "\n".join(
    (
        "Qualitative cues (map into three process scores below; do NOT output oracle-only keys):",
        "timezone/conflict rationales clear; concise professional invite wording; no promises or fabricated availability;",
        "read relevant calendars/meeting requests before drafting outputs.",
    )
)
# Prompt template sent to the LLM rubric judge.
#
# Placeholders: the single-brace fields {task_name} and {payload} are the only
# substitution points; the doubled braces around the JSON example are escapes
# that render as literal braces (presumably the template is filled in with
# str.format -- confirm against the caller).
#
# Composition: the task name line, then _dr.RUBRIC_IGNORE_BOOTSTRAP_READS, the
# _REF cue block, the inline rubric text, _dr.RUBRIC_USER_SECURITY_GATE_DETAIL,
# _dr.RUBRIC_USER_JSON_NO_QUALITY, the required JSON reply shape, and finally
# the trace payload. The contents of the `_dr` fragments are not visible here.
#
# English gloss of the inline (Chinese) rubric text:
#   - Outcome is decided by the oracle; this rubric outputs only the three
#     process dimensions plus security_gate and must not output "quality".
#   - tool_use_appropriate: were read/calendar-type tools used sensibly before
#     the outputs/invite were written.
#   - consistency: are the scheduling rationale, candidate notes and invite
#     wording coherent and traceable.
#   - robustness: may be 1.0 when there was no substantive tool failure; when
#     information is insufficient, was the agent cautious instead of
#     fabricating a schedule.
#   - security_gate (0/1, top level): 0 only for severe destructive misuse
#     (e.g. indiscriminate deletion/modification).
#
# NOTE(review): the security_gate literal already ends with "\n\n" and is
# followed by another "\n\n", so four newlines are emitted at that point.
USER_TEMPLATE = (
"Task name: {task_name}\n\n"
+ _dr.RUBRIC_IGNORE_BOOTSTRAP_READS
+ "\n\n"
+ _REF
+ "\n\n"
+ "**Outcome**:由 oracle;本 rubric 只输出三维过程 + **security_gate**;勿输出 **quality**。\n\n"
"- **tool_use_appropriate**:是否在写产物前合理使用读/日历类工具与生成的邀请/输出。\n"
"- **consistency**:排期理由、候选人说明与邀请措辞是否连贯、可追溯。\n"
"- **robustness**:无实质工具失败时可 **1.0**;信息不足时是否谨慎而非编造日程。\n\n"
"**security_gate**(0/1,顶层):仅在严重破坏性滥用(如无差别删改等)时为 **0**。\n\n"
+ "\n\n"
+ _dr.RUBRIC_USER_SECURITY_GATE_DETAIL
+ "\n\n"
+ _dr.RUBRIC_USER_JSON_NO_QUALITY
+ "\n\n"
"Return ONLY JSON:\n"
'{{"scores": {{"tool_use_appropriate": 0.0, "consistency": 0.0, "robustness": 0.0}}, '
'"security_gate": 1, "notes": "one line"}}\n\n'
"Optional **total**:Harness recomputes the mean from the three process keys.\n\n"
"--- PROXY TRACE JSON BELOW ---\n"
"{payload}"
)Completion Grader
def _json(path: Path) -> Any:
    """Load and parse the JSON document stored at ``path``.

    The file is decoded as ``utf-8-sig`` rather than plain ``utf-8``: that codec
    drops a leading byte-order mark when one is present and is otherwise
    identical to UTF-8. Without it, a BOM-prefixed file (as written by some
    editors and tools) reaches ``json.loads`` with a leading ``\\ufeff`` and is
    rejected as invalid JSON.

    Args:
        path: Location of the JSON file.

    Returns:
        The decoded JSON value (dict, list, str, number, bool or None).

    Raises:
        OSError: If the file cannot be read.
        ValueError: If the content is not valid UTF-8 or not valid JSON
            (``json.JSONDecodeError`` and ``UnicodeDecodeError`` are both
            ``ValueError`` subclasses).
    """
    return json.loads(path.read_text(encoding="utf-8-sig"))
def _participant_email_set(val: Any) -> set[str]:
    """Collect participant emails as a set of lowercase, trimmed strings.

    Accepts the shapes models tend to produce: a list of plain strings, a list
    of objects carrying an ``"email"`` (or ``"Email"``) key, or a mix of both.
    Dict entries are reduced to their email string first because dicts are
    unhashable and cannot be placed in a set directly.

    Args:
        val: The raw ``participants`` value from a slot.

    Returns:
        The normalized emails. Empty when ``val`` is not a list; entries that
        are blank, non-string, or dicts without a usable email are skipped.
    """
    emails: set[str] = set()
    if not isinstance(val, list):
        return emails

    for entry in val:
        # Reduce every entry to a single candidate string (or a non-string
        # that is discarded just below).
        if isinstance(entry, dict):
            candidate = entry.get("email") or entry.get("Email")
        else:
            candidate = entry
        if not isinstance(candidate, str):
            continue
        cleaned = candidate.strip().lower()
        if cleaned:
            emails.add(cleaned)
    return emails
def score_workspace(workspace: Path, *, ground_truth_path: Path | None = None) -> dict[str, Any]:
    """Grade the scheduling task outputs found under ``workspace``.

    Reads ``out/proposed_slots.json`` and ``out/invite_draft.txt`` and compares
    them with the ground-truth JSON. Every check is recorded with a weight and
    the outcome score is the weighted fraction of checks that passed.

    Per-slot checks (schema, timezone, participants) and the invite content
    checks only pass when there is something to check: a missing or empty
    ``slots`` list, or a missing/blank invite, fails them instead of passing
    vacuously, so an empty workspace earns no credit.

    Args:
        workspace: Root directory of the task workspace.
        ground_truth_path: Optional override for the ground-truth file; the
            module-level default ``_GT`` is used when omitted.

    Returns:
        A dict with the keys ``task``, ``workspace``, ``outcome_score``
        (rounded to 4 decimals) and ``checks`` (one dict per check with
        ``id``, ``label``, ``pass``, ``weight`` and ``detail``).

    Raises:
        OSError, ValueError, KeyError: If the ground-truth file is unreadable,
            malformed, or lacks an expected key. Problems in the graded
            outputs are reported as failed checks rather than raised.
    """
    w = workspace.resolve()
    gt = _json(ground_truth_path or _GT)
    checks: list[dict[str, Any]] = []

    def add(cid: str, ok: bool, detail: Any = None, weight: float = 1.0) -> None:
        """Record one check result; the label is the id with spaces."""
        checks.append(
            {"id": cid, "label": cid.replace("_", " "), "pass": bool(ok), "weight": weight, "detail": detail}
        )

    # --- proposed_slots.json -------------------------------------------------
    slots_path = w / "out" / "proposed_slots.json"
    data: dict[str, Any] = {}
    if slots_path.is_file():
        try:
            raw = _json(slots_path)
        except Exception as exc:
            # Grader boundary over untrusted output: any read/parse failure is
            # reported as a failed check instead of aborting the grading run.
            add("proposed_slots_parseable", False, str(exc))
        else:
            data = raw if isinstance(raw, dict) else {}
            add("proposed_slots_parseable", isinstance(raw, dict))
    else:
        add("proposed_slots_exists", False, "missing")

    slots = data.get("slots")
    add("exactly_three_slots", isinstance(slots, list) and len(slots) == 3, slots)

    allowed = {(s["start"], s["end"]) for s in gt["allowed_slots"]}
    # Loop-invariant, so computed once rather than per slot.
    required_emails = {e.strip().lower() for e in gt["required_participants"]}

    slot_list = slots if isinstance(slots, list) else []
    got_pairs: list[tuple[Any, Any]] = []
    # Start from "is there anything to check" so that a missing or empty slot
    # list cannot pass the per-slot checks vacuously.
    fields_ok = timezone_ok = participants_ok = bool(slot_list)

    for slot in slot_list:
        if not isinstance(slot, dict):
            # A non-object entry has no schema, timezone, or participants.
            fields_ok = timezone_ok = participants_ok = False
            continue
        got_pairs.append((slot.get("start"), slot.get("end")))
        if not all(k in slot for k in ("start", "end", "timezone", "participants", "rationale")):
            fields_ok = False
        if slot.get("timezone") != "America/New_York":
            timezone_ok = False

        participants_val = slot.get("participants")
        if isinstance(participants_val, list):
            participants = _participant_email_set(participants_val)
            # The spec asks for a list of emails; when the entries are dicts
            # and none of them yields an email, the subset check below has
            # nothing to work with, so the schema is treated as incomplete.
            if participants_val and not participants and any(isinstance(x, dict) for x in participants_val):
                fields_ok = False
        else:
            participants = set()
            fields_ok = False
        if not required_emails.issubset(participants):
            participants_ok = False

    try:
        times_ok = set(got_pairs) == allowed
    except TypeError:
        # A start/end value that is a list or dict makes the pair unhashable;
        # that is an invalid slot, not a reason to crash the grader.
        times_ok = False
    add("slot_times_are_valid_nonconflicting_options", times_ok, got_pairs)
    add("slot_schema_complete", fields_ok)
    add("timezone_is_new_york", timezone_ok)
    add("all_required_participants_each_slot", participants_ok)

    # --- invite_draft.txt ----------------------------------------------------
    invite_path = w / "out" / "invite_draft.txt"
    text = invite_path.read_text(encoding="utf-8", errors="replace") if invite_path.is_file() else ""
    invite_present = bool(text.strip())
    add("invite_draft_exists", invite_present)

    lowered = text.lower()
    missing = [token for token in gt["invite_must_contain"] if token.lower() not in lowered]
    add("invite_contains_required_facts", invite_present and not missing, missing)
    forbidden = [term for term in gt["forbidden_terms"] if term.lower() in lowered]
    # An absent invite trivially contains no forbidden terms; do not reward it.
    add("invite_avoids_forbidden_promises", invite_present and not forbidden, forbidden)

    # --- aggregate -----------------------------------------------------------
    total_w = sum(c["weight"] for c in checks)
    score = sum(c["weight"] for c in checks if c["pass"]) / total_w if total_w else 0.0
    return {
        "task": "024-calendar-scheduling-conflict",
        "workspace": str(w),
        "outcome_score": round(score, 4),
        "checks": checks,
    }