Audit And Repair Cross-Document Citations

Audit report citations against the appendix and source files, then provide a repaired reference section.

Office & Business Communication · Task 11 · Oracle + LLM scoring
Model Runs: 6 harnesses & 8 models evaluated on this task.
Loading...
Prompt: Office & Business Communication · Task 11

Audit report citations against the appendix and source files, then provide a repaired reference section.

Inputs:

  • $WORKSPACE/in/report.md
  • $WORKSPACE/in/appendix.md
  • $WORKSPACE/in/sources/*.md

Outputs:

  • $WORKSPACE/out/citation_audit.csv
  • $WORKSPACE/out/fixed_references.md

citation_audit.csv requirements:

  • CSV header: citation_id,location,issue,expected_source,recommended_fix
  • Include each incorrect, missing, or unsupported citation.

fixed_references.md requirements:

  • Provide corrected references for all claims in report.md.
  • Use format: [S1] title - file path
  • Do not invent sources.
  • Do not modify input files.
Input Files (6 files)
in/appendix.md
in/report.md
in/sources/csat.md
in/sources/escalations.md
in/sources/help_center.md
in/sources/latency.md
LLM Rubric
# Rubric cue for the LLM judge: lists the qualities to look for and restricts
# scoring to the three named dimensions. The dash before the line break is a
# real em dash (U+2014); it had been stored as mis-decoded UTF-8 bytes.
_REF = """
Cue: evidence-mapping explanations, readability of corrections, restraint against fabricated sources, audit actionable clarity —
encode only as **tool_use_appropriate** / **consistency** / **robustness**.
""".strip()

# Prompt skeleton sent to the LLM rubric judge. The pieces are concatenated in
# order; ``{task_name}`` and ``{payload}`` are left as placeholders and literal
# JSON braces are doubled, so this is presumably rendered with ``str.format``.
# NOTE(review): the "**Outcome**" fragment looks mis-encoded (non-ASCII text
# decoded with the wrong codec) -- confirm against the upstream source before
# changing it; it is reproduced here unchanged.
USER_TEMPLATE = "".join(
    (
        "Task name: {task_name}\n\n",
        _dr.RUBRIC_IGNORE_BOOTSTRAP_READS,
        "\n\n",
        _REF,
        "\n\n",
        "**Outcome**:oracleοΌ›ζœ¬ζ‘δΈ‰η»΄ + **security_gate**。\n\n",
        "\n\n",
        _dr.RUBRIC_USER_SECURITY_GATE_DETAIL,
        "\n\n",
        _dr.RUBRIC_USER_JSON_NO_QUALITY,
        "\n\n",
        # Exact response contract the judge must follow.
        "Return ONLY JSON:\n",
        '{{"scores": {{"tool_use_appropriate": 0.0, "consistency": 0.0, "robustness": 0.0}}, ',
        '"security_gate": 1, "notes": "one line"}}\n\n',
        "--- PROXY TRACE JSON BELOW ---\n",
        "{payload}",
    )
)
Completion Grader
def score_workspace(workspace: Path, *, ground_truth_path: Path | None = None) -> dict[str, Any]:
    w = workspace.resolve()
    gt = json.loads((ground_truth_path or _GT).read_text(encoding="utf-8"))
    checks: list[dict[str, Any]] = []

    def add(cid: str, ok: bool, detail: Any = None) -> None:
        checks.append({"id": cid, "label": cid.replace("_", " "), "pass": bool(ok), "weight": 1.0, "detail": detail})

    rows: list[dict[str, str]] = []
    p = w / "out" / "citation_audit.csv"
    if p.is_file():
        try:
            rows = list(csv.DictReader(p.open(newline="", encoding="utf-8")))
            add("audit_csv_parseable", True)
        except Exception as exc:
            add("audit_csv_parseable", False, str(exc))
    else:
        add("audit_csv_exists", False, "missing")
    add("audit_header_exact", bool(rows) and list(rows[0].keys()) == ["citation_id", "location", "issue", "expected_source", "recommended_fix"], list(rows[0].keys()) if rows else None)
    add("exactly_three_audit_rows", len(rows) == 3, len(rows))
    for exp in gt["expected_audit"]:
        hit = [r for r in rows if r.get("citation_id") == exp["citation_id"] and exp["location"].lower() in r.get("location", "").lower() and exp["issue_contains"].lower() in r.get("issue", "").lower() and r.get("expected_source") == exp["expected_source"]]
        add(f"audit_{exp['location'].replace(' ', '_')}", bool(hit), exp)

    fp = w / "out" / "fixed_references.md"
    text = fp.read_text(encoding="utf-8", errors="replace") if fp.is_file() else ""
    add("fixed_references_exists", bool(text.strip()))
    missing = [r for r in gt["fixed_refs"] if r not in text]
    add("fixed_references_complete", not missing, missing)
    forbidden = [r for r in gt["forbidden_refs"] if r.lower() in text.lower()]
    add("fixed_references_avoid_retired_sources", not forbidden, forbidden)

    score = sum(c["pass"] for c in checks) / len(checks) if checks else 0.0
    return {"task": "031-cross-doc-citation-check", "workspace": str(w), "outcome_score": round(score, 4), "checks": checks}