Model Runs
6 harnesses & 8 models evaluated on this task.
Loading...
Prompt
Knowledge, Evidence & Retrieval · Task 6
Task: Citation Consistency Audit
Inputs:
- $WORKSPACE/in/paper_draft.md
- $WORKSPACE/in/bib.json
Audit consistency between in-text citations and the bibliography, and create these outputs:
$WORKSPACE/out/citation_errors.csv
- Header must be `error_type,citation_key,details,expected_fix,evidence_span`.
- Cover at least: citations present in the draft but missing from `bib.json`, bibliography entries not cited in the draft, author mismatches, and year mismatches.
- Also cover: same-author same-year suffix needs such as 2024a/2024b, duplicate or conflicting bibliography keys, and title/DOI inconsistencies.
- `error_type` must use this enum: `missing_bib`, `orphan_bib`, `author_mismatch`, `year_mismatch`, `suffix_needed`, `duplicate_key`, `doi_title_mismatch`.
- Put each error point on its own row; do not merge multiple `citation_key` values into one row.
- `evidence_span` must contain a short span from the draft or bibliography supporting the error judgment; do not use only a filename.
$WORKSPACE/out/corrected_bib.json
- Must be valid JSON.
- Correct author/year errors that can be determined from the draft.
- Remove orphan bibliography entries and add draft citations that are missing from `bib.json`; fields that cannot be determined may use `"UNKNOWN"`.
- Do not create a separate bibliography entry for the indirect Rao mention unless the draft reference list explicitly lists one.
$WORKSPACE/out/audit_notes.md
- Explain the basis for each error type.
- Clearly state which fields use `UNKNOWN`, and state that no internet lookup was used.
- Explain the ordering basis for the 2024a/2024b suffixes and why Rao should not become a standalone bibliography entry.
$WORKSPACE/out/citation_graph.json
- JSON object with keys `in_text_citations`, `bibliography_entries`, and `key_renames`.
- `in_text_citations` must map each draft citation signal to the corrected bibliography key or to a missing/orphan/error status.
- `bibliography_entries` must identify whether each original bibliography entry was kept, renamed, removed, or split.
- `key_renames` must explicitly include the Lin, Ortega, and Chen key changes.
Forbidden: do not use the internet; do not modify fixtures; do not write citation_errors.csv as a Markdown table.
Input Files: 2 files
in/bib.json
in/paper_draft.md
LLM Rubric
# User-message template for the LLM rubric judge.
# The literals visible here contain two replacement fields, {task_name} and
# {payload}; the remaining text is spliced in from constants on `_dr`.
# NOTE(review): the doubled braces in the JSON example look like str.format()
# escapes — confirm the caller fills this template with str.format().
USER_TEMPLATE = (
"Task name: {task_name}\n\n" + _dr.RUBRIC_IGNORE_BOOTSTRAP_READS + "\n\n" + _dr.RUBRIC_USER_OUTCOME_NOTICE + "\n\nEvaluate citation consistency auditing. Reward careful extraction of in-text citations, comparison with bib.json, precise CSV errors, valid corrected JSON, and conservative UNKNOWN use for truly unavailable data.\n\n"
# NOTE(review): the literal above already ends in "\n\n", so the separator
# below yields four consecutive newlines before the security-gate text.
+ "\n\n"
+ _dr.RUBRIC_USER_SECURITY_GATE_DETAIL
+ "\n\n"
+ _dr.RUBRIC_USER_JSON_NO_QUALITY
+ "\n\n"
# No leading `+` on the next literal: it is joined to the preceding "\n\n" by
# adjacent string-literal concatenation.
"Return ONLY JSON: {{\"scores\": {{\"tool_use_appropriate\": 0.0, \"consistency\": 0.0, \"robustness\": 0.0}}, \"security_gate\": 1, \"notes\": \"one line\"}}\n\n--- PROXY TRACE JSON BELOW ---\n{payload}"
# NOTE(review): the words "Completion Grader" after the closing parenthesis
# are a page heading fused onto this line by text extraction, not code.
)Completion Grader
def _norm(value: Any) -> str:
return str(value or "").strip().lower()
def _row_matches(row: dict[str, Any], expected_key: str, terms: list[str]) -> bool:
    """Report whether *row* has an evidence span and mentions every term.

    Each term is normalised with ``_norm`` and searched for in the
    space-joined, normalised ``evidence_span``, ``details`` and
    ``expected_fix`` cells.  A row whose ``evidence_span`` normalises to the
    empty string never matches.  ``expected_key`` is not used by the body.
    """
    evidence = _norm(row.get("evidence_span"))
    if not evidence:
        return False
    haystack = " ".join(
        (evidence, _norm(row.get("details")), _norm(row.get("expected_fix")))
    )
    for term in terms:
        if _norm(term) not in haystack:
            return False
    return True
def score_workspace(workspace: Path) -> dict[str, Any]:
    """Grade the citation-audit outputs found under ``<workspace>/out``.

    The expected answers come from ``ground_truth.json`` in ``TASK_DIR``.
    Four artefacts are scored independently and then blended:

    * ``citation_errors.csv`` -> ``error_score`` (row coverage, evidence
      spans, exact row count without merged keys)
    * ``corrected_bib.json`` -> ``corrected_score`` (required/forbidden keys
      and per-entry field expectations)
    * ``audit_notes.md`` -> ``notes_score`` (presence of ground-truth terms)
    * ``citation_graph.json`` -> ``graph_score`` (top-level keys and terms)

    ``total = 0.45*error + 0.27*corrected + 0.08*format + 0.10*notes +
    0.10*graph``; the total is then capped when an individual component falls
    below its floor.

    Args:
        workspace: Root of the run's workspace; outputs are read from its
            ``out`` sub-directory.

    Returns:
        A dict with keys ``task``, ``workspace``, ``outcome_score`` (rounded to
        four decimals), ``level`` and ``checks``.

    Raises:
        OSError, json.JSONDecodeError, KeyError: if the ground-truth file is
            missing, is not valid JSON, or lacks ``scoring.thresholds``.
            Problems with the *submitted* CSV/JSON files are caught and
            recorded as failed checks instead.
    """
    w = Path(workspace).resolve()
    gt = json.loads((TASK_DIR / "ground_truth.json").read_text(encoding="utf-8"))
    err_path = w / "out" / "citation_errors.csv"
    bib_path = w / "out" / "corrected_bib.json"
    notes_path = w / "out" / "audit_notes.md"
    graph_path = w / "out" / "citation_graph.json"
    # NOTE(review): the per-check "weight" values below (0.40, 0.40, 0.10,
    # 0.10, 0.10) differ from the blend used for `total` and sum to 1.10;
    # confirm they are informational only.
    checks: list[dict[str, Any]] = []
    error_score = corrected_score = format_score = 0.0

    def cell(row: dict[str, Any], column: str) -> str:
        # csv.DictReader fills the missing cells of a short row with None, so
        # a plain row.get(column, "") can still hand back None.
        return row.get(column) or ""

    # ----- citation_errors.csv ------------------------------------------
    try:
        # utf-8-sig decodes plain UTF-8 unchanged but drops a leading BOM,
        # which would otherwise turn the first header into "\ufefferror_type".
        with err_path.open("r", encoding="utf-8-sig", newline="") as fh:
            rows = list(csv.DictReader(fh))
        required_cols = {"error_type", "citation_key", "details", "expected_fix", "evidence_span"}
        cols_ok = bool(rows) and required_cols.issubset(rows[0].keys())
        format_score += 0.5 if cols_ok else 0.0

        # Index rows by "<error_type>:<citation_key>" for lookup below.
        rows_by_key: dict[str, list[dict[str, Any]]] = {}
        for row in rows:
            etype_cell = cell(row, "error_type").strip()
            key_cell = cell(row, "citation_key").strip()
            rows_by_key.setdefault(f"{etype_cell}:{key_cell}", []).append(row)
        no_merged_keys = all(
            "/" not in cell(row, "citation_key") and "," not in cell(row, "citation_key")
            for row in rows
        )

        error_aliases = gt.get("error_aliases", {})
        expected = [
            (etype, key)
            for etype, keys in gt["errors"].items()
            for key in keys
        ]

        # Each submitted row may satisfy at most one expected error, so track
        # which (lookup key, position) pairs have already been used.
        consumed_rows: set[tuple[str, int]] = set()
        hits = 0
        for etype, citation_key in expected:
            canonical = f"{etype}:{citation_key}"
            candidates = [canonical, *error_aliases.get(canonical, [])]
            matched = next(
                (
                    (candidate, idx)
                    for candidate in candidates
                    for idx in range(len(rows_by_key.get(candidate, [])))
                    if (candidate, idx) not in consumed_rows
                ),
                None,
            )
            if matched is not None:
                consumed_rows.add(matched)
                hits += 1

        span_terms = gt.get("evidence_span_terms", {})
        evidence_hits = 0
        for key, terms in span_terms.items():
            keys = [key, *error_aliases.get(key, [])]
            candidate_rows = [row for k in keys for row in rows_by_key.get(k, [])]
            if any(_row_matches(row, key, terms) for row in candidate_rows):
                evidence_hits += 1

        row_score = hits / len(expected)
        span_score = evidence_hits / max(len(span_terms), 1)
        count_score = 1.0 if len(rows) == len(expected) and no_merged_keys else 0.0
        error_score = 0.65 * row_score + 0.25 * span_score + 0.10 * count_score
        checks.append(
            {
                "id": "errors_csv",
                "label": "citation_errors.csv covers expected error rows and evidence spans",
                "pass": error_score >= 1.0 and cols_ok,
                "weight": 0.40,
                "detail": {
                    "hits": hits,
                    "expected": len(expected),
                    "evidence_hits": evidence_hits,
                    "rows": len(rows),
                    "no_merged_keys": no_merged_keys,
                },
            }
        )
    except Exception as exc:
        # Deliberately broad: any failure on the submitted CSV becomes a
        # failed check rather than aborting the whole grading run.
        checks.append({"id": "errors_parse", "label": "citation_errors.csv parseable", "pass": False, "weight": 0.40, "detail": str(exc)})

    # ----- corrected_bib.json -------------------------------------------
    try:
        bib = json.loads(bib_path.read_text(encoding="utf-8"))
        format_score += 0.5 if isinstance(bib, dict) else 0.0
        required = gt["corrected"]["required_keys"]
        forbidden = gt["corrected"]["forbidden_keys"]
        required_share = sum(1 for key in required if key in bib) / len(required)
        forbidden_share = sum(1 for key in forbidden if key not in bib) / len(forbidden)
        key_score = 0.5 * required_share + 0.2 * forbidden_share
        field_hits = 0
        field_total = 0
        for key, exp in gt["corrected"]["field_expectations"].items():
            entry = bib.get(key, {})
            if "year" in exp:
                field_total += 1
                actual_year = str(entry.get("year"))
                expected_year = str(exp["year"])
                if expected_year in {"2024a", "2024b"}:
                    # A suffixed expectation also accepts the bare year.
                    field_hits += int(actual_year in {expected_year, "2024"})
                else:
                    field_hits += int(actual_year == expected_year)
            if "title" in exp:
                field_total += 1
                field_hits += int(_norm(entry.get("title")) == _norm(exp["title"]))
            if "authors_contain" in exp:
                field_total += 1
                # Serialise whatever shape "authors" has and search it as text.
                authors = json.dumps(entry.get("authors", []), ensure_ascii=False).lower()
                field_hits += int(all(token.lower() in authors for token in exp["authors_contain"]))
        corrected_score = min(1.0, key_score + 0.3 * (field_hits / max(field_total, 1)))
        checks.append({"id": "corrected_bib", "label": "corrected_bib.json has corrected keys and fields", "pass": corrected_score >= 0.85, "weight": 0.40, "detail": {"field_hits": field_hits, "field_total": field_total}})
    except Exception as exc:
        checks.append({"id": "bib_parse", "label": "corrected_bib.json parseable", "pass": False, "weight": 0.40, "detail": str(exc)})

    checks.append({"id": "format", "label": "required output formats are valid CSV and JSON", "pass": format_score >= 1.0, "weight": 0.10, "detail": {"score": format_score}})

    # ----- audit_notes.md -----------------------------------------------
    notes_score = 0.0
    if notes_path.is_file():
        notes = notes_path.read_text(encoding="utf-8", errors="replace").lower()
        aliases = gt.get("audit_term_aliases", {})
        audit_terms = gt.get("audit_terms", [])
        note_hits = 0
        for term in audit_terms:
            candidates = [term, *aliases.get(term, [])]
            note_hits += int(any(candidate.lower() in notes for candidate in candidates))
        notes_score = note_hits / max(len(audit_terms), 1)
    checks.append({"id": "audit_notes", "label": "audit_notes.md explains unknowns, suffixes, and no external lookup", "pass": notes_score >= 0.75, "weight": 0.10, "detail": {"score": notes_score}})

    # ----- citation_graph.json ------------------------------------------
    graph_score = 0.0
    try:
        graph = json.loads(graph_path.read_text(encoding="utf-8"))
        keys_ok = isinstance(graph, dict) and {"in_text_citations", "bibliography_entries", "key_renames"}.issubset(graph)
        # Terms are matched case-sensitively against the re-serialised graph.
        text = json.dumps(graph, ensure_ascii=False)
        terms = gt.get("citation_graph_terms", [])
        graph_score = 0.25 * bool(keys_ok) + 0.75 * (sum(term in text for term in terms) / max(len(terms), 1))
        checks.append({"id": "citation_graph", "label": "citation_graph.json maps draft citations to corrected bibliography actions", "pass": graph_score >= 0.85, "weight": 0.10, "detail": {"score": round(graph_score, 4)}})
    except Exception as exc:
        checks.append({"id": "citation_graph_parse", "label": "citation_graph.json parseable", "pass": False, "weight": 0.10, "detail": str(exc)})

    # ----- blend, cap, and classify -------------------------------------
    total = 0.45 * error_score + 0.27 * corrected_score + 0.08 * format_score + 0.10 * notes_score + 0.10 * graph_score
    # Each weak component imposes a ceiling; the lowest ceiling wins.
    caps = []
    if error_score < 0.95:
        caps.append(0.78)
    if corrected_score < 0.85:
        caps.append(0.74)
    if format_score < 1.0:
        caps.append(0.70)
    if graph_score < 0.60:
        caps.append(0.84)
    if caps:
        total = min(total, min(caps))
    th = gt["scoring"]["thresholds"]
    if total >= th["excellent"]:
        level = "excellent"
    elif total >= th["good"]:
        level = "good"
    elif total >= th["pass"]:
        level = "pass"
    else:
        level = "fail"
    return {"task": "036-citation-consistency-audit", "workspace": str(w), "outcome_score": round(float(total), 4), "level": level, "checks": checks}