Model Runs: 6 harnesses & 8 models evaluated on this task.
Loading...
Prompt: Vertical Professional Workflows · Task 1
You are the launch operations lead for a small B2B product release.
Read these offline input files:
- $WORKSPACE/in/product.md
- $WORKSPACE/in/budget.csv
- $WORKSPACE/in/audience.json
Create exactly these outputs:
- $WORKSPACE/out/launch_plan.md
- $WORKSPACE/out/content_pack.json
- $WORKSPACE/out/launch_checklist.csv
Requirements for $WORKSPACE/out/launch_plan.md:
- Include sections named: Objectives, Audience, Budget, Timeline, Dependencies, Compliance, Risks.
- Mention all three audience segments from audience.json.
- Include a timeline with these milestone dates: 2026-05-01, 2026-05-10, 2026-05-15, 2026-05-20.
- Keep total planned spend at or below the approved budget in budget.csv.
- Include the phrase "offline webinar" and do not promise live product availability before 2026-05-20.
- Reserve budget for compliance_review before expanding paid social.
- Do not target unavailable audience segments except to explain why they are excluded.
- Do not promise the mobile app integration or general availability before the release date.
Requirements for $WORKSPACE/out/content_pack.json:
- Valid JSON object with keys: tagline, email_subjects, social_posts, webinar_agenda, segment_messages.
- email_subjects must contain at least 3 strings.
- social_posts must contain at least 3 strings.
- webinar_agenda must contain at least 4 agenda items.
- segment_messages must include one message for each segment id from audience.json.
- For unavailable segments, segment_messages must mark the segment excluded or not targeted rather than inventing launch copy.
Requirements for $WORKSPACE/out/launch_checklist.csv:
- Header: item,owner,due_date,dependency,status
- Include compliance approval, webinar registration page, claims list, sales enablement, and launch day readiness.
- Dependencies must reflect the offline source constraints.
Do not use the network. Do not modify input files.
Input Files: 3 files
in/audience.json
in/budget.csv
in/product.md
LLM Rubric
_REF = """
Task: Product launch ops โ inputs product.md, budget.csv, audience.json โ outputs launch_plan.md, content_pack.json,
launch_checklist.csv. Offline only; deterministic oracle checks sections, schema, CSV header, phrases, dates, budget caps.
""".strip()
# Prompt sent to the LLM judge for process scoring. It is assembled from the
# task name line, shared rubric fragments on `_dr`, the `_REF` task summary and
# the task-specific scoring instructions below.
# `{task_name}` and `{payload}` are placeholders; the doubled braces around the
# JSON example read as literal braces if this is rendered with `str.format` —
# presumably it is; confirm at the call site.
# Punctuation that was mis-decoded in the source text (full-width colon, em
# dashes, arrow) is restored so the judge does not receive garbled characters.
USER_TEMPLATE = (
    "Task name: {task_name}\n\n"
    + _dr.RUBRIC_IGNORE_BOOTSTRAP_READS
    + "\n\n"
    + _REF
    + "\n\n"
    + """**Outcome**：`oracle_grade.score_workspace` returns `outcome_score` and optional **quality** (workspace deliverable fit).
Harness blends with **outcome_llm_weight** / **HARNESSBENCH_OUTCOME_LLM_WEIGHT**. This rubric outputs **process only**:
three scores + **security_gate** — do **not** output **quality** here.
Judge the trace for:
- **tool_use_appropriate**: reads the three inputs before writing launch_plan.md, content_pack.json, launch_checklist.csv under `out/`; avoids irrelevant tools for an offline workspace task.
- **consistency**: coherent narrative from inputs → structured plan / JSON / checklist; milestones and segments align with sources.
- **robustness**: if the trace shows no material tool failures needing recovery, score **1.0**; otherwise penalize unmanaged errors.
"""
    + "\n\n"
    + _dr.RUBRIC_USER_SECURITY_GATE_DETAIL
    + "\n\n"
    + _dr.RUBRIC_USER_JSON_NO_QUALITY
    + "\n\n"
    + """Return ONLY JSON:
{{"scores": {{"tool_use_appropriate": 0.0, "consistency": 0.0, "robustness": 0.0}}, "security_gate": 1, "notes": "one line"}}
Optional **total**: mean of three process scores — harness recomputes the mean anyway.
--- PROXY TRACE JSON BELOW ---
{payload}"""
)
# Completion Grader
def _check(cid: str, label: str, ok: bool, weight: float, detail: str = "") -> dict[str, Any]:
return {"id": cid, "label": label, "pass": bool(ok), "weight": weight, "detail": None if ok else detail}
def _json(path: Path) -> Any:
return json.loads(path.read_text(encoding="utf-8"))
def score_workspace(workspace: Path, *, ground_truth_path: Path | None = None) -> dict[str, Any]:
    """Grade the three launch deliverables under ``workspace / "out"``.

    Runs a fixed sequence of weighted pass/fail checks against
    ``launch_plan.md``, ``content_pack.json`` and ``launch_checklist.csv`` and
    reports the weight-normalised share of checks that passed.

    Args:
        workspace: Root of the agent workspace; resolved before use.
        ground_truth_path: JSON file with the expected sections, segments,
            dates, phrases and budget. Defaults to ``TASK_DIR / "ground_truth.json"``.

    Returns:
        A dict with ``task``, ``workspace`` (resolved path as a string),
        ``outcome_score`` (rounded to 4 places) and ``checks`` (the per-check
        records in evaluation order).

    Raises:
        OSError, json.JSONDecodeError, KeyError: if the ground-truth file is
            unreadable, malformed or missing a key read outside the guarded
            content-pack / checklist sections.
    """
    w = workspace.resolve()
    gt = _json(ground_truth_path or TASK_DIR / "ground_truth.json")
    plan_path = w / "out/launch_plan.md"
    pack_path = w / "out/content_pack.json"
    checklist_path = w / "out/launch_checklist.csv"

    # is_file() rather than exists(): a directory placed at the plan path must
    # grade as "no plan" instead of crashing read_text().
    plan_present = plan_path.is_file()
    pack_present = pack_path.is_file()
    checklist_present = checklist_path.is_file()

    plan = plan_path.read_text(encoding="utf-8", errors="replace") if plan_present else ""
    plan_l = plan.lower()

    checks: list[dict[str, Any]] = []
    checks.append(_check("plan_exists", "launch_plan.md exists", plan_present, 0.08))
    checks.append(_check("pack_exists", "content_pack.json exists", pack_present, 0.08))
    checks.append(_check("checklist_exists", "launch_checklist.csv exists", checklist_present, 0.06))

    # Every ground-truth term matched against lower-cased text is lower-cased
    # too, so a mixed-case entry in ground_truth.json cannot make a check
    # unpassable (or a forbidden phrase undetectable).
    checks.append(_check("sections", "plan contains required sections", all(s.lower() in plan_l for s in gt["required_sections"]), 0.12))
    checks.append(_check("audience", "plan mentions all audience segment names", all(s.lower() in plan_l for s in gt["segment_names"]), 0.12))
    checks.append(_check("timeline", "plan includes all required milestone dates", all(d in plan for d in gt["required_dates"]), 0.12))
    checks.append(_check("offline_webinar", "plan includes offline webinar", gt["required_phrase"].lower() in plan_l, 0.08))

    # NOTE(review): only the first four forbidden phrases are consulted, and
    # "promise mobile app" is a hard-coded literal — confirm both are intended.
    forbidden_hit = any(p.lower() in plan_l for p in gt["forbidden_early_availability"][:4])
    checks.append(_check("no_forbidden_claims", "plan avoids early availability and unavailable feature claims", plan_present and not forbidden_hit and "promise mobile app" not in plan_l, 0.08))

    # First pass: 3-5 digit runs after thousands separators are stripped.
    # This also picks up years from the milestone dates.
    amounts = [int(x) for x in re.findall(r"\$?\b([0-9]{3,5})\b", plan.replace(",", ""))]
    # The bounded {3,5} pattern cannot see figures of six or more digits, so a
    # line such as "$250,000" would slip past the cap. Add explicit dollar
    # amounts of that size; shorter ones are already covered above.
    for raw in re.findall(r"\$\s?([0-9][0-9,]*[0-9])", plan):
        digits = raw.replace(",", "")
        if len(digits) > 5:
            amounts.append(int(digits))
    over_budget = any(x > gt["approved_total_usd"] for x in amounts)
    checks.append(_check("budget_bound", "no single planned spend exceeds approved total", plan_present and amounts and not over_budget, 0.06))

    checks.append(_check("dependency_compliance_terms", "plan covers compliance, dependencies, and excluded segment", all(t.lower() in plan_l for t in gt["required_plan_terms"]), 0.10))

    pack_ok = False
    pack_detail = "missing or invalid JSON"
    if pack_present:
        # Broad catch is deliberate: the file is agent-written and any failure
        # while inspecting it should fail this check, not abort grading.
        try:
            pack = _json(pack_path)
            if not isinstance(pack, dict):
                # Guard before calling .get(): valid JSON that is not an object
                # previously surfaced as an AttributeError message.
                pack_detail = "content_pack.json top level is not a JSON object"
            else:
                keys = {"tagline", "email_subjects", "social_posts", "webinar_agenda", "segment_messages"}
                seg_msg = pack.get("segment_messages", {})
                excluded_words = ["exclude", "not target", "unavailable", "not in this release"]
                pack_ok = (
                    keys.issubset(pack)
                    and isinstance(pack.get("email_subjects"), list) and len(pack["email_subjects"]) >= 3
                    and isinstance(pack.get("social_posts"), list) and len(pack["social_posts"]) >= 3
                    and isinstance(pack.get("webinar_agenda"), list) and len(pack["webinar_agenda"]) >= 4
                    and isinstance(seg_msg, dict) and all(seg in seg_msg for seg in gt["segments"])
                    and all(seg in seg_msg for seg in gt["available_segments"])
                    and "agency_partners" in seg_msg
                    # The unavailable segment must be marked as excluded rather
                    # than given launch copy.
                    and any(word in json.dumps(seg_msg.get("agency_partners", ""), ensure_ascii=False).lower() for word in excluded_words)
                )
                pack_detail = "required content_pack keys, counts, or segment messages missing"
        except Exception as exc:
            pack_detail = str(exc)
    checks.append(_check("content_pack_schema", "content_pack.json schema and required counts", pack_ok, 0.20, pack_detail))

    checklist_ok = False
    checklist_detail = "missing or unreadable CSV"
    if checklist_present:
        # Same best-effort boundary as the content pack, but the reason is now
        # recorded instead of being silently dropped.
        try:
            with checklist_path.open("r", encoding="utf-8", newline="") as f:
                reader = csv.DictReader(f)
                rows = list(reader)
                header = reader.fieldnames or []
            # Serialising the row dicts means both column names and cell values
            # are searched for the required terms.
            all_text = json.dumps(rows, ensure_ascii=False).lower()
            checklist_ok = header == ["item", "owner", "due_date", "dependency", "status"] and all(term.lower() in all_text for term in gt["required_checklist_terms"])
            checklist_detail = "header mismatch or required checklist terms missing"
        except Exception as exc:
            checklist_detail = str(exc)
    checks.append(_check("launch_checklist", "launch_checklist.csv captures dependencies", checklist_ok, 0.08, checklist_detail))

    # Normalise by the total weight so the score stays in [0, 1] whatever the
    # individual weights sum to.
    total_w = sum(c["weight"] for c in checks)
    score = round(sum(c["weight"] for c in checks if c["pass"]) / total_w, 4) if total_w else 0.0
    return {"task": "068-product-launch-ops", "workspace": str(w), "outcome_score": score, "checks": checks}