Model Runs: 6 harnesses & 8 models evaluated on this task.
Loading...
Prompt: Office & Business Communication · Task 7
Review the draft contract against company policy.
Inputs:
- $WORKSPACE/in/contract.md
- $WORKSPACE/in/company_policy.md
Outputs:
- $WORKSPACE/out/contract_summary.md
- $WORKSPACE/out/risk_clauses.csv
risk_clauses.csv requirements:
- CSV header: clause_id,risk_type,quote,recommended_action,severity
- Include each clause that conflicts with company_policy.md.
- quote must contain exact text from contract.md.
- severity must be High, Medium, or Low.
contract_summary.md requirements:
- Summarize parties, term, renewal, payment, data/security, liability, and termination.
- Include a short "Policy risks" section.
- Do not provide legal advice beyond recommended business review actions.
- Do not modify input files.
Input Files: 2 files
in/company_policy.md
in/contract.md
LLM Rubric
# Task-specific cue text for the rubric prompt; it is concatenated into
# USER_TEMPLATE. The trailing .strip() removes the newlines that the
# triple-quoted layout adds at both ends of the literal.
_REF = """
Cue: mapping summary accuracy vs clause text, plain-language risk framing, restrained quote use without overclaiming —
fold into standard **tool_use_appropriate** / **consistency** / **robustness** only.
""".strip()
# User-message template for the LLM rubric. It stitches together shared rubric
# fragments from `_dr`, the task-specific cue in `_REF`, and the required JSON
# reply shape, followed by the trace payload.
#
# Placeholders: {task_name} and {payload}.
# NOTE(review): the doubled braces ({{ and }}) suggest the template is rendered
# with str.format, where they become literal JSON braces — confirm at the call
# site.
USER_TEMPLATE = (
    "Task name: {task_name}\n\n"
    + _dr.RUBRIC_IGNORE_BOOTSTRAP_READS
    + "\n\n"
    + _REF
    + "\n\n"
    + "**Outcome**:oracle;本条只三维 + **security_gate**;勿 **quality**。\n\n"
    + "\n\n"
    + _dr.RUBRIC_USER_SECURITY_GATE_DETAIL
    + "\n\n"
    + _dr.RUBRIC_USER_JSON_NO_QUALITY
    # The literals below are adjacent-string concatenated into one literal
    # that is appended after the fragment above.
    + "\n\n"
    "Return ONLY JSON:\n"
    '{{"scores": {{"tool_use_appropriate": 0.0, "consistency": 0.0, "robustness": 0.0}}, '
    '"security_gate": 1, "notes": "one line"}}\n\n'
    "--- PROXY TRACE JSON BELOW ---\n"
    "{payload}"
)
# Completion Grader
def score_workspace(workspace: Path, *, ground_truth_path: Path | None = None) -> dict[str, Any]:
w = workspace.resolve()
gt = json.loads((ground_truth_path or _GT).read_text(encoding="utf-8"))
checks: list[dict[str, Any]] = []
def add(cid: str, ok: bool, detail: Any = None) -> None:
checks.append({"id": cid, "label": cid.replace("_", " "), "pass": bool(ok), "weight": 1.0, "detail": detail})
rows: list[dict[str, str]] = []
p = w / "out" / "risk_clauses.csv"
if p.is_file():
try:
rows = list(csv.DictReader(p.open(newline="", encoding="utf-8")))
add("risk_csv_parseable", True)
except Exception as exc:
add("risk_csv_parseable", False, str(exc))
else:
add("risk_csv_exists", False, "missing")
add("risk_csv_header_exact", bool(rows) and list(rows[0].keys()) == ["clause_id", "risk_type", "quote", "recommended_action", "severity"], list(rows[0].keys()) if rows else None)
add("exactly_six_risk_rows", len(rows) == 6, len(rows))
severities_ok = bool(rows) and all(r.get("severity") in {"High", "Medium", "Low"} for r in rows)
add("severity_values_valid", severities_ok)
for exp in gt["expected_risks"]:
matches = [
r for r in rows
if r.get("clause_id") == exp["clause_id"]
and exp["risk_type"].lower() in r.get("risk_type", "").lower()
and exp["quote_contains"].lower() in r.get("quote", "").lower()
and exp["action_contains"].lower() in r.get("recommended_action", "").lower()
]
add(f"risk_{exp['clause_id']}_{exp['risk_type'].replace(' ', '_')}", bool(matches), exp)
s = w / "out" / "contract_summary.md"
text = s.read_text(encoding="utf-8", errors="replace") if s.is_file() else ""
add("summary_exists", bool(text.strip()))
missing = [t for t in gt["summary_terms"] if t.lower() not in text.lower()]
add("summary_covers_key_terms", not missing, missing)
add("summary_has_policy_risks_section", "policy risks" in text.lower())
forbidden = [t for t in gt["forbidden_terms"] if t.lower() in text.lower()]
add("summary_avoids_forbidden_advice", not forbidden, forbidden)
score = sum(c["pass"] for c in checks) / len(checks) if checks else 0.0
return {"task": "027-contract-summary-risk", "workspace": str(w), "outcome_score": round(score, 4), "checks": checks}