Task 9 — Office & Business Communication

Model Runs: 6 harnesses & 8 models evaluated on this task.

Loading...

Prompt: Office & Business Communication · Task 9

Review the reimbursement packet for policy issues and missing documentation.

Inputs:

$WORKSPACE/in/receipts.csv
$WORKSPACE/in/policy.md
$WORKSPACE/in/receipt_texts/*.txt

Outputs:

$WORKSPACE/out/reimbursement_audit.csv
$WORKSPACE/out/missing_docs.md

reimbursement_audit.csv requirements:

CSV header: receipt_id,employee,amount,category,issue,allowed_amount,recommended_action
Include every receipt with a policy issue, duplicate, or missing/invalid attachment.
Use numeric amounts without currency symbols.

missing_docs.md requirements:

List every receipt that lacks a usable attachment and the reason.
Include the total claimed amount and total allowed amount after policy caps.
Do not modify input files.

Input Files: 7 files

in/policy.md

in/receipt_texts/r001.txt

in/receipt_texts/r002.txt

in/receipt_texts/r003.txt

in/receipt_texts/r004.txt

in/receipt_texts/r006.txt

in/receipts.csv

LLM Rubric

# Task-specific cue text that is embedded in the rubric prompt below.
# Two lines, no leading or trailing whitespace.
_REF = (
    "Cue: policy-aligned reasoning for approvals/denials, audit usefulness vs evidence grounding, stakeholder-safe wording —\n"
    "map into standard three process axes only (no bespoke score keys)."
)

# Rubric prompt skeleton. Sections are separated by one blank line.
# `{task_name}` and `{payload}` are left as placeholders and the JSON example
# uses doubled braces — presumably the template is filled with `str.format`;
# confirm against the caller. `_dr` is a shared-rubric module defined outside
# this view.
USER_TEMPLATE = "\n\n".join(
    (
        "Task name: {task_name}",
        _dr.RUBRIC_IGNORE_BOOTSTRAP_READS,
        _REF,
        # The trailing "\n\n" is deliberate: together with the separator it
        # yields the four newlines that follow this sentence in the prompt.
        "**Outcome**：oracle；本条三维 + **security_gate**。\n\n",
        _dr.RUBRIC_USER_SECURITY_GATE_DETAIL,
        _dr.RUBRIC_USER_JSON_NO_QUALITY,
        "Return ONLY JSON:\n"
        '{{"scores": {{"tool_use_appropriate": 0.0, "consistency": 0.0, "robustness": 0.0}}, '
        '"security_gate": 1, "notes": "one line"}}',
        "--- PROXY TRACE JSON BELOW ---\n{payload}",
    )
)

Completion Grader

def _money_eq(a: str, b: str) -> bool:
    """Return True when two monetary values are the same amount to the cent.

    Args:
        a: Submitted value. It is converted with ``str()`` and any ``$``
            characters are removed before parsing.
        b: Reference value, parsed with ``float()`` as-is.

    Returns:
        True if both values parse as numbers and differ by less than half a
        cent; False otherwise, including when either value cannot be parsed.
    """
    try:
        submitted = float(str(a).replace("$", ""))
        expected = float(b)
    except (TypeError, ValueError, OverflowError):
        # Unparseable input simply does not match; the grader must not crash.
        return False
    # Half-cent tolerance. A full-cent bound is too loose with binary floats:
    # float("10.01") - float("10.00") is slightly below 0.01, so amounts that
    # are one cent apart would be reported as equal.
    return abs(submitted - expected) < 0.005


def score_workspace(workspace: Path, *, ground_truth_path: Path | None = None) -> dict[str, Any]:
    w = workspace.resolve()
    gt = json.loads((ground_truth_path or _GT).read_text(encoding="utf-8"))
    checks: list[dict[str, Any]] = []

    def add(cid: str, ok: bool, detail: Any = None) -> None:
        checks.append({"id": cid, "label": cid.replace("_", " "), "pass": bool(ok), "weight": 1.0, "detail": detail})

    rows: list[dict[str, str]] = []
    p = w / "out" / "reimbursement_audit.csv"
    if p.is_file():
        try:
            rows = list(csv.DictReader(p.open(newline="", encoding="utf-8")))
            add("audit_csv_parseable", True)
        except Exception as exc:
            add("audit_csv_parseable", False, str(exc))
    else:
        add("audit_csv_exists", False, "missing")
    add("audit_header_exact", bool(rows) and list(rows[0].keys()) == ["receipt_id", "employee", "amount", "category", "issue", "allowed_amount", "recommended_action"], list(rows[0].keys()) if rows else None)
    add("exactly_five_issue_rows", len(rows) == 5, len(rows))
    for exp in gt["expected_issues"]:
        hit = [r for r in rows if r.get("receipt_id") == exp["receipt_id"] and exp["issue_contains"].lower() in r.get("issue", "").lower() and _money_eq(r.get("allowed_amount", ""), exp["allowed_amount"])]
        add(f"issue_{exp['receipt_id']}", bool(hit), exp)
    add("no_currency_symbols_in_amounts", bool(rows) and all("$" not in (r.get("amount", "") + r.get("allowed_amount", "")) for r in rows))

    mp = w / "out" / "missing_docs.md"
    text = mp.read_text(encoding="utf-8", errors="replace") if mp.is_file() else ""
    add("missing_docs_exists", bool(text.strip()))
    add("missing_docs_lists_required_receipts", all(rid in text for rid in gt["missing_doc_receipts"]), text)
    add("missing_docs_total_claimed", gt["total_claimed"] in text or "$" + gt["total_claimed"] in text)
    add("missing_docs_total_allowed", gt["total_allowed"] in text or "$" + gt["total_allowed"] in text)

    score = sum(c["pass"] for c in checks) / len(checks) if checks else 0.0
    return {"task": "029-expense-packet-review", "workspace": str(w), "outcome_score": round(score, 4), "checks": checks}

Audit An Expense Reimbursement Packet