Model Runs: 6 harnesses & 8 models evaluated on this task.
Loading...
Prompt: Vertical Professional Workflows · Task 6
You are auditing a research reproducibility package.
Read:
- $WORKSPACE/in/README.md
- $WORKSPACE/in/scripts/*
- $WORKSPACE/in/results.csv
- $WORKSPACE/in/paper_claims.md
Package intentionally mixes broken automation, missing raw inputs, published metric mismatches, and Appendix artifacts absent from the deposit.
Create:
- $WORKSPACE/out/reproducibility_report.md
- $WORKSPACE/out/missing_steps.csv
Requirements for reproducibility_report.md:
- Use one Markdown section per paper claim, headed exactly:
  `### Claim CLAIM-X` (examples: `### Claim CLAIM-1`, …, `### Claim CLAIM-4`).
- Inside each section, map that claim to expected artifacts (`results.csv`, scripts, subgroup CSV paths) and state honestly whether the evidence supports reproducibility.
- Explain why the main headline findings cannot be reproduced end-to-end from what ships today (missing raw bundle, corrupt driver script, CSV/paper deltas, absent subgroup tables).
- Explicitly tie the numerical disagreement on retention lift (paper vs `results.csv`) to CLAIM-2 without inventing rerun logs.
Requirements for missing_steps.csv:
- First row must match exactly: item,type,impact,recommended_fix
- Provide ≥ 4 rows, each with non-empty impact + recommended_fix.
- Rows must collectively cover:
- Missing trial-level raw inputs referenced by README
- Non-functional / syntax-corrupted analysis driver (`analyze_main.py`)
- Metric/table mismatch between paper CLAIM-2 and shipped `results.csv` (`retention_lift`)
- Missing Appendix D subgroup artifact (`results/subgroup_ablation.csv`)
Encode those gaps using concise item/type wording—oracle checks normalized keywords.
Do not fabricate execution results or pretend reruns succeeded.
Do not use the network. Do not modify input files.
Input Files: 3 files
in/README.md
in/paper_claims.md
in/results.csv
LLM Rubric
# Prompt template sent to the LLM rubric judge for this task.
#
# It is assembled from a task-specific instruction block plus shared rubric
# snippets read from `_dr` (defined outside this view).  The `{task_name}` and
# `{payload}` fields are placeholders; literal JSON braces are doubled, so the
# template is presumably filled with `str.format` -- confirm against the caller.
USER_TEMPLATE = (
    "Task name: {task_name}\n\n"
    + _dr.RUBRIC_IGNORE_BOOTSTRAP_READS
    + "\n\n"
    + _dr.RUBRIC_USER_OUTCOME_NOTICE
    + "\n\n"
    + """Evaluate the agent run for a research reproducibility audit. Deterministic checks cover CLAIM-section structure, Appendix/subgroup gaps, corrupted analyze_main.py awareness, CSV expectation patterns, numeric mismatch citations, and refusal to fake success language. Score only qualitative aspects:
- tool_use_appropriate: inspects README, scripts folder (including defective analyze_main.py), results.csv, and expanded claims list.
- consistency: separates headline metrics vs appendix subgroup artifacts before concluding non-reproducibility.
- robustness: refuses fabricated rerun logs while documenting SyntaxError-class defects precisely enough for downstream fixes.
"""
    + "\n\n"
    + _dr.RUBRIC_USER_SECURITY_GATE_DETAIL
    + "\n\n"
    + _dr.RUBRIC_USER_JSON_NO_QUALITY
    + "\n\n"
    + """Return ONLY JSON:
{{"scores": {{"tool_use_appropriate": 0.0, "consistency": 0.0, "robustness": 0.0}}, "security_gate": 1, "notes": "one line"}}
Optional **total**: harness recomputes mean of three process scores.
--- PROXY TRACE JSON BELOW ---
{payload}
"""
)
# Completion Grader
def _check(cid: str, label: str, ok: bool, weight: float, detail: str = "") -> dict[str, Any]:
return {"id": cid, "label": label, "pass": bool(ok), "weight": weight, "detail": None if ok else detail}
def _first_line(path: Path) -> str:
raw = path.read_text(encoding="utf-8-sig", errors="replace").splitlines()
return raw[0].strip() if raw else ""
def _norm(text: str) -> str:
return text.lower().replace("_", " ").replace("-", " ")
def _claim_sections(md: str) -> dict[str, str]:
rx = re.compile(r"^###\s+Claim\s+(CLAIM-\d+)\s*$", re.MULTILINE)
ms = list(rx.finditer(md))
out: dict[str, str] = {}
for i, m in enumerate(ms):
cid = m.group(1)
start = m.end()
end = ms[i + 1].start() if i + 1 < len(ms) else len(md)
out[cid] = md[start:end]
return out
def _expect_rows(rows: list[dict[str, str]], expectations: list[dict[str, Any]]) -> tuple[bool, str]:
    """Check that every expectation is satisfied by at least one CSV row.

    A row satisfies an expectation when its ``impact`` and
    ``recommended_fix`` cells are non-empty, its normalised ``item`` contains
    *all* of the expectation's ``item_needles``, and its normalised ``type``
    contains *any* of the expectation's ``type_needles``.  An empty or absent
    needle list places no constraint on that column.

    Args:
        rows: Parsed rows of the CSV, one dict per row.
        expectations: Dicts with optional ``label``, ``item_needles`` and
            ``type_needles`` keys.

    Returns:
        ``(ok, detail)`` where ``ok`` is ``True`` when no expectation failed
        and ``detail`` is the ``"; "``-joined labels of the failed ones.
    """
    failures: list[str] = []
    for exp in expectations:
        # Needles do not depend on the row, so normalise them once.
        item_needles = [_norm(n) for n in exp.get("item_needles") or []]
        type_needles = [_norm(t) for t in exp.get("type_needles") or []]
        matched = False
        for row in rows:
            # csv.DictReader fills the cells of a short row with None; a bare
            # str() would turn that into the non-empty text "None" and let a
            # row without impact/fix slip through, so map None to "".
            impact = str(row.get("impact") or "").strip()
            fix = str(row.get("recommended_fix") or "").strip()
            if not impact or not fix:
                continue
            item_n = _norm(str(row.get("item") or ""))
            typ_n = _norm(str(row.get("type") or ""))
            if item_needles and not all(n in item_n for n in item_needles):
                continue
            if type_needles and not any(t in typ_n for t in type_needles):
                continue
            matched = True
            break
        if not matched:
            failures.append(exp.get("label") or "expectation")
    return not failures, "; ".join(failures)
def score_workspace(workspace: Path, *, ground_truth_path: Path | None = None) -> dict[str, Any]:
    """Run the deterministic checks for this task against an agent workspace.

    The ground-truth JSON supplies the output paths (``report_path``,
    ``missing_path``), the expected ``claim_ids``, ``required_terms``, the
    exact ``missing_steps_header`` and, optionally,
    ``forbidden_success_phrases`` and ``missing_expectations``.

    Args:
        workspace: Root directory the agent wrote its outputs into.
        ground_truth_path: Alternative ground-truth file; defaults to
            ``TASK_DIR / "ground_truth.json"``.

    Returns:
        A dict with ``task``, ``workspace``, ``outcome_score`` (weighted
        fraction of passed checks, rounded to 4 places) and ``checks``.

    Raises:
        OSError, ValueError, KeyError: if the ground-truth file cannot be
            read, is not valid JSON, or lacks a required key.
    """
    w = workspace.resolve()
    gt = json.loads((ground_truth_path or TASK_DIR / "ground_truth.json").read_text(encoding="utf-8"))
    report_path = w / gt["report_path"]
    missing_path = w / gt["missing_path"]
    # Only regular files are ever read: a directory at either path must count
    # as "absent" instead of crashing read_text()/open().
    report_ok = report_path.is_file()
    missing_ok = missing_path.is_file()

    checks: list[dict[str, Any]] = []
    checks.append(_check("report_exists", "reproducibility_report.md exists", report_ok, 0.04))
    checks.append(_check("missing_exists", "missing_steps.csv exists", missing_ok, 0.04))

    # --- report checks -----------------------------------------------------
    text = report_path.read_text(encoding="utf-8", errors="replace") if report_ok else ""
    text_l = text.lower()
    sections = _claim_sections(text) if text else {}
    want_claims = set(gt["claim_ids"])
    sections_ok = report_ok and set(sections.keys()) == want_claims
    checks.append(_check("claim_sections", "report has ### Claim sections for each CLAIM id", sections_ok, 0.10))

    # An absent report cannot cite anything, even if no terms are required.
    terms_ok = report_ok and all(term.lower() in text_l for term in gt["required_terms"])
    checks.append(_check("required_terms", "report cites artifacts metrics and failures", terms_ok, 0.10))

    forbid = list(gt.get("forbidden_success_phrases") or [])
    no_false = report_ok and not any(p.lower() in text_l for p in forbid)
    checks.append(_check("no_false_repro", "does not assert successful reproduction", no_false, 0.10))

    defect_words = ("syntax", "parse", "indent", "broken", "invalid", "corrupt", "error")
    script_audit = report_ok and "analyze_main" in text_l and any(k in text_l for k in defect_words)
    checks.append(_check("script_issue_called_out", "report flags analyze_main packaging defect", script_audit, 0.08))

    # --- missing_steps.csv checks -------------------------------------------
    rows: list[dict[str, str]] = []
    header_line_ok = missing_ok and _first_line(missing_path) == gt["missing_steps_header"]
    cols_ok = False
    if missing_ok:
        try:
            # errors="replace" mirrors _first_line: badly encoded agent output
            # should lose points, not abort grading.
            with missing_path.open("r", encoding="utf-8-sig", errors="replace", newline="") as f:
                rows = list(csv.DictReader(f))
        except csv.Error:
            # Structurally unparseable CSV is treated as having no rows.
            rows = []
        cols_ok = bool(rows) and set(rows[0].keys()) == {"item", "type", "impact", "recommended_fix"}
    checks.append(_check("missing_csv_header_line", "missing_steps header row exact", header_line_ok, 0.04))
    checks.append(_check("missing_csv_columns", "missing_steps DictReader columns", cols_ok, 0.04))

    exp_list = list(gt.get("missing_expectations") or [])
    min_rows_ok = len(rows) >= len(exp_list)
    checks.append(_check("missing_minimum_rows", "missing_steps lists each blocking gap", min_rows_ok, 0.06))

    miss_ok, miss_detail = _expect_rows(rows, exp_list)
    # _check already stores miss_detail on failure, so no post-hoc patching
    # of the record is needed.
    checks.append(
        _check(
            "missing_expectation_coverage",
            "CSV rows cover raw/script/mismatch/subgroup gaps",
            miss_ok,
            0.40,
            miss_detail,
        )
    )

    # --- aggregate ------------------------------------------------------------
    total_w = sum(c["weight"] for c in checks)
    score = round(sum(c["weight"] for c in checks if c["pass"]) / total_w, 4) if total_w else 0.0
    return {"task": "073-research-repro-package", "workspace": str(w), "outcome_score": score, "checks": checks}