Model Runs
6 harnesses & 8 models evaluated on this task.
Loading...
Prompt
Office & Business Communication · Task 6
Create a structured slide brief for the HelioDesk Q3 product update. Do not create a pptx.
Inputs:
- $WORKSPACE/in/product_notes.md
- $WORKSPACE/in/metrics.csv
- $WORKSPACE/in/template_outline.md
Outputs:
- $WORKSPACE/out/slides_outline.json
- $WORKSPACE/out/speaker_notes.md
slides_outline.json requirements:
- JSON object with a "slides" array of exactly 6 slides.
- Each slide has: slide_number, title, bullets, metric_refs.
- Follow the six sections in template_outline.md in order.
- Cite metric IDs from metrics.csv in metric_refs when a slide uses a metric.
speaker_notes.md requirements:
- Include notes for all 6 slides.
- Cover the main value propositions: faster onboarding, lower ticket volume, enterprise controls, and renewal expansion.
- Do not invent metrics or customer names.
Input Files
3 files
in/metrics.csv
in/product_notes.md
in/template_outline.md
LLM Rubric
# Task-specific grading cue for the rubric prompt: tells the grader which
# qualities to weigh and to fold them into the standard process dimensions.
# Stored already stripped, so there is no leading or trailing whitespace.
_REF = (
    "Cue: narrative flow, concision, source fidelity vs deck script readiness"
    " — fold into standard process dimensions."
)
USER_TEMPLATE = (
"Task name: {task_name}\n\n"
+ _dr.RUBRIC_IGNORE_BOOTSTRAP_READS
+ "\n\n"
+ _REF
+ "\n\n"
+ "**Outcome**:oracle;本条三维 + **security_gate**;勿 **quality**。\n\n"
"- **tool_use_appropriate / consistency / robustness**:按上方 cue 归入三维;无实质失败可对 **robustness** 给 **1.0**。\n\n"
+ "\n\n"
+ _dr.RUBRIC_USER_SECURITY_GATE_DETAIL
+ "\n\n"
+ _dr.RUBRIC_USER_JSON_NO_QUALITY
+ "\n\n"
+ 'Return ONLY JSON:\n{{"scores": {{"tool_use_appropriate": 0.0, "consistency": 0.0, '
'"robustness": 0.0}}, "security_gate": 1, "notes": "one line"}}\n\n'
"--- PROXY TRACE JSON BELOW ---\n"
"{payload}"
)Completion Grader
def score_workspace(workspace: Path, *, ground_truth_path: Path | None = None) -> dict[str, Any]:
w = workspace.resolve()
gt = json.loads((ground_truth_path or _GT).read_text(encoding="utf-8"))
checks: list[dict[str, Any]] = []
def add(cid: str, ok: bool, detail: Any = None) -> None:
checks.append({"id": cid, "label": cid.replace("_", " "), "pass": bool(ok), "weight": 1.0, "detail": detail})
data: dict[str, Any] = {}
p = w / "out" / "slides_outline.json"
if p.is_file():
try:
raw = json.loads(p.read_text(encoding="utf-8"))
data = raw if isinstance(raw, dict) else {}
add("outline_json_parseable", isinstance(raw, dict))
except Exception as exc:
add("outline_json_parseable", False, str(exc))
else:
add("outline_exists", False, "missing")
slides = data.get("slides")
add("six_slides", isinstance(slides, list) and len(slides) == 6, len(slides) if isinstance(slides, list) else None)
titles = [str(s.get("title", "")) for s in slides] if isinstance(slides, list) else []
add("titles_follow_template_order", titles == gt["titles_in_order"], titles)
schema_ok = isinstance(slides, list) and all(isinstance(s, dict) and all(k in s for k in ["slide_number", "title", "bullets", "metric_refs"]) for s in slides)
add("slide_schema_complete", schema_ok)
refs = set()
if isinstance(slides, list):
for s in slides:
refs.update(str(x) for x in (s.get("metric_refs") or []))
add("all_metrics_referenced", set(gt["metric_ids"]).issubset(refs), sorted(refs))
add("no_unknown_metric_refs", refs.issubset(set(gt["metric_ids"])), sorted(refs - set(gt["metric_ids"])))
notes_path = w / "out" / "speaker_notes.md"
text = notes_path.read_text(encoding="utf-8", errors="replace") if notes_path.is_file() else ""
add("speaker_notes_exists", bool(text.strip()))
add("speaker_notes_cover_all_slides", all(f"Slide {i}" in text or f"slide {i}" in text.lower() for i in range(1, 7)))
missing_terms = [t for t in gt["speaker_note_terms"] if t.lower() not in text.lower()]
add("speaker_notes_cover_value_props", not missing_terms, missing_terms)
forbidden = [t for t in gt["forbidden_terms"] if t.lower() in (json.dumps(data) + text).lower()]
add("forbidden_unapproved_feature_omitted", not forbidden, forbidden)
score = sum(c["pass"] for c in checks) / len(checks) if checks else 0.0
return {"task": "026-ppt-brief-generation", "workspace": str(w), "outcome_score": round(score, 4), "checks": checks}