Model Runs
6 harnesses & 8 models evaluated on this task.
Loading...
Prompt
Office & Business Communication · Task 10
Revise the Markdown draft using the style guide and legal comments.
Inputs:
- $WORKSPACE/in/draft.md
- $WORKSPACE/in/style_guide.md
- $WORKSPACE/in/legal_comments.md
Outputs:
- $WORKSPACE/out/revised_draft.md
- $WORKSPACE/out/change_log.csv
revised_draft.md requirements:
- Preserve the "Data Processing Addendum" section and the "No Warranty Expansion" clause.
- Replace banned hype language with factual wording.
- Add the required legal disclaimer in the pilot description.
- Keep Markdown headings.
change_log.csv requirements:
- CSV header: section,change_type,source_comment,summary
- Map each required edit to either style_guide or legal_comments.
- Do not modify input files.
Input Files
3 files
in/draft.md
in/legal_comments.md
in/style_guide.md
LLM Rubric
# Task-specific cue text that is spliced into the rubric prompt template.
_REF = (
    "Cue: revision clarity, clause alignment vs policy, changelog usefulness "
    "vs restraint/map into three Harness process keys only."
)
# Prompt template for the rubric grader. Sections are separated by a blank
# line. `{task_name}` and `{payload}` look like str.format placeholders (the
# doubled braces would then render as literal JSON braces) -- confirm at the
# call site.
USER_TEMPLATE = "\n\n".join(
    (
        "Task name: {task_name}",
        _dr.RUBRIC_IGNORE_BOOTSTRAP_READS,
        _REF,
        # The trailing "\n\n" is intentional: together with the separator it
        # leaves four newlines after the outcome line.
        "**Outcome**:oracle;本条三维 + **security_gate**。\n\n",
        _dr.RUBRIC_USER_SECURITY_GATE_DETAIL,
        _dr.RUBRIC_USER_JSON_NO_QUALITY,
        (
            "Return ONLY JSON:\n"
            '{{"scores": {{"tool_use_appropriate": 0.0, "consistency": 0.0, "robustness": 0.0}}, '
            '"security_gate": 1, "notes": "one line"}}\n\n'
            "--- PROXY TRACE JSON BELOW ---\n"
            "{payload}"
        ),
    )
)
# Completion Grader
def score_workspace(workspace: Path, *, ground_truth_path: Path | None = None) -> dict[str, Any]:
w = workspace.resolve()
gt = json.loads((ground_truth_path or _GT).read_text(encoding="utf-8"))
checks: list[dict[str, Any]] = []
def add(cid: str, ok: bool, detail: Any = None) -> None:
checks.append({"id": cid, "label": cid.replace("_", " "), "pass": bool(ok), "weight": 1.0, "detail": detail})
rp = w / "out" / "revised_draft.md"
text = rp.read_text(encoding="utf-8", errors="replace") if rp.is_file() else ""
add("revised_draft_exists", bool(text.strip()))
missing = [x for x in gt["must_contain"] if x not in text]
add("required_text_preserved_or_added", not missing, missing)
forbidden = [x for x in gt["forbidden_terms"] if x.lower() in text.lower()]
add("banned_hype_removed", not forbidden, forbidden)
add("markdown_headings_preserved", all(h in text for h in ["## Overview", "## Pilot Scope", "## Data Processing Addendum", "## No Warranty Expansion", "## Call To Action"]))
rows: list[dict[str, str]] = []
cp = w / "out" / "change_log.csv"
if cp.is_file():
try:
rows = list(csv.DictReader(cp.open(newline="", encoding="utf-8")))
add("change_log_parseable", True)
except Exception as exc:
add("change_log_parseable", False, str(exc))
else:
add("change_log_exists", False, "missing")
add("change_log_header_exact", bool(rows) and list(rows[0].keys()) == ["section", "change_type", "source_comment", "summary"], list(rows[0].keys()) if rows else None)
blob = json.dumps(rows).lower()
add("change_log_has_required_sources", all(src in blob for src in gt["change_log_sources"]), blob)
add("change_log_maps_legal_comments", all(c.lower() in blob for c in gt["required_comments"]), blob)
score = sum(c["pass"] for c in checks) / len(checks) if checks else 0.0
return {"task": "030-word-revision-plan", "workspace": str(w), "outcome_score": round(score, 4), "checks": checks}