Extended product launch operations plan

You are the launch operations lead for a small B2B product release.

Vertical Professional WorkflowsTask 1Oracle + LLM scoring
Model Runs6 harnesses & 8 models evaluated on this task.
Loading...
PromptVertical Professional Workflows ยท Task 1

You are the launch operations lead for a small B2B product release.

Read these offline input files:

  • $WORKSPACE/in/product.md
  • $WORKSPACE/in/budget.csv
  • $WORKSPACE/in/audience.json

Create exactly these outputs:

  • $WORKSPACE/out/launch_plan.md
  • $WORKSPACE/out/content_pack.json
  • $WORKSPACE/out/launch_checklist.csv

Requirements for $WORKSPACE/out/launch_plan.md:

  • Include sections named: Objectives, Audience, Budget, Timeline, Dependencies, Compliance, Risks.
  • Mention all three audience segments from audience.json.
  • Include a timeline with these milestone dates: 2026-05-01, 2026-05-10, 2026-05-15, 2026-05-20.
  • Keep total planned spend at or below the approved budget in budget.csv.
  • Include the phrase "offline webinar" and do not promise live product availability before 2026-05-20.
  • Reserve budget for compliance_review before expanding paid social.
  • Do not target unavailable audience segments except to explain why they are excluded.
  • Do not promise the mobile app integration or general availability before the release date.

Requirements for $WORKSPACE/out/content_pack.json:

  • Valid JSON object with keys: tagline, email_subjects, social_posts, webinar_agenda, segment_messages.
  • email_subjects must contain at least 3 strings.
  • social_posts must contain at least 3 strings.
  • webinar_agenda must contain at least 4 agenda items.
  • segment_messages must include one message for each segment id from audience.json.
  • For unavailable segments, segment_messages must mark the segment excluded or not targeted rather than inventing launch copy.

Requirements for $WORKSPACE/out/launch_checklist.csv:

  • Header: item,owner,due_date,dependency,status
  • Include compliance approval, webinar registration page, claims list, sales enablement, and launch day readiness.
  • Dependencies must reflect the offline source constraints.

Do not use the network. Do not modify input files.

Input Files3 files
in/audience.json
in/budget.csv
in/product.md
LLM Rubric
_REF = """
Task: Product launch ops โ€” inputs product.md, budget.csv, audience.json โ†’ outputs launch_plan.md, content_pack.json,
launch_checklist.csv. Offline only; deterministic oracle checks sections, schema, CSV header, phrases, dates, budget caps.
""".strip()

USER_TEMPLATE = (
    "Task name: {task_name}\n\n"
    + _dr.RUBRIC_IGNORE_BOOTSTRAP_READS
    + "\n\n"
    + _REF
    + "\n\n"
    + """**Outcome**๏ผš`oracle_grade.score_workspace` returns `outcome_score` and optional **quality** (workspace deliverable fit).
Harness blends with **outcome_llm_weight** / **HARNESSBENCH_OUTCOME_LLM_WEIGHT**. This rubric outputs **process only**:
three scores + **security_gate** โ€” do **not** output **quality** here.

Judge the trace for:
- **tool_use_appropriate**: reads the three inputs before writing launch_plan.md, content_pack.json, launch_checklist.csv under `out/`; avoids irrelevant tools for an offline workspace task.
- **consistency**: coherent narrative from inputs โ†’ structured plan / JSON / checklist; milestones and segments align with sources.
- **robustness**: if the trace shows no material tool failures needing recovery, score **1.0**; otherwise penalize unmanaged errors.


"""
    + "\n\n"
    + _dr.RUBRIC_USER_SECURITY_GATE_DETAIL
    + "\n\n"
    + _dr.RUBRIC_USER_JSON_NO_QUALITY
    + "\n\n"
    + """Return ONLY JSON:
{{"scores": {{"tool_use_appropriate": 0.0, "consistency": 0.0, "robustness": 0.0}}, "security_gate": 1, "notes": "one line"}}

Optional **total**: mean of three process scores โ€” harness recomputes the mean anyway.

--- PROXY TRACE JSON BELOW ---
{payload}"""
)
Completion Grader
def _check(cid: str, label: str, ok: bool, weight: float, detail: str = "") -> dict[str, Any]:
    return {"id": cid, "label": label, "pass": bool(ok), "weight": weight, "detail": None if ok else detail}


def _json(path: Path) -> Any:
    return json.loads(path.read_text(encoding="utf-8"))


def score_workspace(workspace: Path, *, ground_truth_path: Path | None = None) -> dict[str, Any]:
    w = workspace.resolve()
    gt = _json(ground_truth_path or TASK_DIR / "ground_truth.json")
    plan_path = w / "out/launch_plan.md"
    pack_path = w / "out/content_pack.json"
    plan = plan_path.read_text(encoding="utf-8", errors="replace") if plan_path.exists() else ""
    plan_l = plan.lower()
    checks: list[dict[str, Any]] = []

    checks.append(_check("plan_exists", "launch_plan.md exists", plan_path.is_file(), 0.08))
    checks.append(_check("pack_exists", "content_pack.json exists", pack_path.is_file(), 0.08))
    checklist_path = w / "out/launch_checklist.csv"
    checks.append(_check("checklist_exists", "launch_checklist.csv exists", checklist_path.is_file(), 0.06))
    checks.append(_check("sections", "plan contains required sections", all(s.lower() in plan_l for s in gt["required_sections"]), 0.12))
    checks.append(_check("audience", "plan mentions all audience segment names", all(s.lower() in plan_l for s in gt["segment_names"]), 0.12))
    checks.append(_check("timeline", "plan includes all required milestone dates", all(d in plan for d in gt["required_dates"]), 0.12))
    checks.append(_check("offline_webinar", "plan includes offline webinar", gt["required_phrase"] in plan_l, 0.08))
    checks.append(_check("no_forbidden_claims", "plan avoids early availability and unavailable feature claims", plan_path.is_file() and not any(p in plan_l for p in gt["forbidden_early_availability"][:4]) and "promise mobile app" not in plan_l, 0.08))

    amounts = [int(x) for x in re.findall(r"\$?\b([0-9]{3,5})\b", plan.replace(",", ""))]
    over_budget = any(x > gt["approved_total_usd"] for x in amounts)
    checks.append(_check("budget_bound", "no single planned spend exceeds approved total", plan_path.is_file() and amounts and not over_budget, 0.06))
    checks.append(_check("dependency_compliance_terms", "plan covers compliance, dependencies, and excluded segment", all(t in plan_l for t in gt["required_plan_terms"]), 0.10))

    pack_ok = False
    pack_detail = "missing or invalid JSON"
    if pack_path.exists():
        try:
            pack = _json(pack_path)
            keys = {"tagline", "email_subjects", "social_posts", "webinar_agenda", "segment_messages"}
            seg_msg = pack.get("segment_messages", {})
            pack_ok = (
                isinstance(pack, dict)
                and keys.issubset(pack)
                and isinstance(pack.get("email_subjects"), list) and len(pack["email_subjects"]) >= 3
                and isinstance(pack.get("social_posts"), list) and len(pack["social_posts"]) >= 3
                and isinstance(pack.get("webinar_agenda"), list) and len(pack["webinar_agenda"]) >= 4
                and isinstance(seg_msg, dict) and all(seg in seg_msg for seg in gt["segments"])
                and all(seg in seg_msg for seg in gt["available_segments"])
                and "agency_partners" in seg_msg
                and any(word in json.dumps(seg_msg.get("agency_partners", ""), ensure_ascii=False).lower() for word in ["exclude", "not target", "unavailable", "not in this release"])
            )
            pack_detail = "required content_pack keys, counts, or segment messages missing"
        except Exception as exc:
            pack_detail = str(exc)
    checks.append(_check("content_pack_schema", "content_pack.json schema and required counts", pack_ok, 0.20, pack_detail))

    checklist_ok = False
    if checklist_path.exists():
        try:
            with checklist_path.open("r", encoding="utf-8", newline="") as f:
                reader = csv.DictReader(f)
                rows = list(reader)
                header = reader.fieldnames or []
            all_text = json.dumps(rows, ensure_ascii=False).lower()
            checklist_ok = header == ["item", "owner", "due_date", "dependency", "status"] and all(term in all_text for term in gt["required_checklist_terms"])
        except Exception:
            checklist_ok = False
    checks.append(_check("launch_checklist", "launch_checklist.csv captures dependencies", checklist_ok, 0.08))

    total_w = sum(c["weight"] for c in checks)
    score = round(sum(c["weight"] for c in checks if c["pass"]) / total_w, 4) if total_w else 0.0
    return {"task": "068-product-launch-ops", "workspace": str(w), "outcome_score": score, "checks": checks}