Research reproducibility package completeness check

You are auditing a research reproducibility package.

Vertical Professional Workflows · Task 6 · Oracle + LLM scoring

Task ID: 073-research-repro-package

Difficulty: Hard

Tags

Model Runs: 6 harnesses & 8 models evaluated on this task.

Prompt: Vertical Professional Workflows · Task 6

You are auditing a research reproducibility package.

Read:

$WORKSPACE/in/README.md
$WORKSPACE/in/scripts/*
$WORKSPACE/in/results.csv
$WORKSPACE/in/paper_claims.md

Package intentionally mixes broken automation, missing raw inputs, published metric mismatches, and Appendix artifacts absent from the deposit.

Create:

$WORKSPACE/out/reproducibility_report.md
$WORKSPACE/out/missing_steps.csv

Requirements for reproducibility_report.md:

Use one Markdown section per paper claim, headed exactly: ### Claim CLAIM-X (examples: ### Claim CLAIM-1, …, ### Claim CLAIM-4).
Inside each section map that claim to expected artifacts (results.csv, scripts, subgroup CSV paths) and state whether evidence supports reproducibility honestly.
Explain why the main headline findings cannot be reproduced end-to-end from what ships today (missing raw bundle, corrupt driver script, CSV/paper deltas, absent subgroup tables).
Explicitly tie numerical disagreement on retention lift (paper vs results.csv) to CLAIM-2 without inventing rerun logs.

Requirements for missing_steps.csv:

First row must match exactly: item,type,impact,recommended_fix
Provide ≥ 4 rows, each with non-empty impact + recommended_fix.
Rows must collectively cover:

Missing trial-level raw inputs referenced by README
Non-functional / syntax-corrupted analysis driver (analyze_main.py)
Metric/table mismatch between paper CLAIM-2 and shipped results.csv (retention_lift)
Missing Appendix D subgroup artifact (results/subgroup_ablation.csv)

Encode those gaps using concise item/type wording — the oracle checks normalized keywords.

Do not fabricate execution results or pretend reruns succeeded.

Do not use the network. Do not modify input files.

Input Files: 3 files

in/README.md

in/paper_claims.md

in/results.csv

LLM Rubric

# Prompt template for the LLM judge that scores the qualitative (process)
# rubric: tool_use_appropriate, consistency, and robustness.
# It is built by concatenating shared rubric fragments from `_dr` (defined
# outside this view) with the task-specific instructions below.
# NOTE(review): the `{task_name}` / `{payload}` placeholders and the doubled
# braces around the JSON example suggest this is rendered with `str.format`;
# confirm against the caller before adding any literal braces.
USER_TEMPLATE = (
    "Task name: {task_name}\n\n"
    + _dr.RUBRIC_IGNORE_BOOTSTRAP_READS
    + "\n\n"
    + _dr.RUBRIC_USER_OUTCOME_NOTICE
    + "\n\n"
    + """Evaluate the agent run for a research reproducibility audit. Deterministic checks cover CLAIM-section structure, Appendix/subgroup gaps, corrupted analyze_main.py awareness, CSV expectation patterns, numeric mismatch citations, and refusal to fake success language. Score only qualitative aspects:
- tool_use_appropriate: inspects README, scripts folder (including defective analyze_main.py), results.csv, and expanded claims list.
- consistency: separates headline metrics vs appendix subgroup artifacts before concluding non-reproducibility.
- robustness: refuses fabricated rerun logs while documenting SyntaxError-class defects precisely enough for downstream fixes.

"""
    + "\n\n"
    + _dr.RUBRIC_USER_SECURITY_GATE_DETAIL
    + "\n\n"
    + _dr.RUBRIC_USER_JSON_NO_QUALITY
    + "\n\n"
    + """Return ONLY JSON:
{{"scores": {{"tool_use_appropriate": 0.0, "consistency": 0.0, "robustness": 0.0}}, "security_gate": 1, "notes": "one line"}}

Optional **total**: harness recomputes mean of three process scores.

--- PROXY TRACE JSON BELOW ---
{payload}
"""
)

Completion Grader

def _check(cid: str, label: str, ok: bool, weight: float, detail: str = "") -> dict[str, Any]:
    return {"id": cid, "label": label, "pass": bool(ok), "weight": weight, "detail": None if ok else detail}


def _first_line(path: Path) -> str:
    raw = path.read_text(encoding="utf-8-sig", errors="replace").splitlines()
    return raw[0].strip() if raw else ""


def _norm(text: str) -> str:
    return text.lower().replace("_", " ").replace("-", " ")


def _claim_sections(md: str) -> dict[str, str]:
    rx = re.compile(r"^###\s+Claim\s+(CLAIM-\d+)\s*$", re.MULTILINE)
    ms = list(rx.finditer(md))
    out: dict[str, str] = {}
    for i, m in enumerate(ms):
        cid = m.group(1)
        start = m.end()
        end = ms[i + 1].start() if i + 1 < len(ms) else len(md)
        out[cid] = md[start:end]
    return out


def _expect_rows(rows: list[dict[str, str]], expectations: list[dict[str, Any]]) -> tuple[bool, str]:
    """Check that every expectation is satisfied by at least one CSV row.

    A row satisfies an expectation when:
      * its ``impact`` and ``recommended_fix`` cells are both non-empty,
      * its normalized ``item`` contains *all* of the expectation's
        ``item_needles`` (if any are given), and
      * its normalized ``type`` contains *any* of the expectation's
        ``type_needles`` (if any are given).

    Args:
        rows: Parsed CSV rows (as produced by ``csv.DictReader``).
        expectations: Dicts with optional ``label``, ``item_needles`` and
            ``type_needles`` keys.

    Returns:
        ``(ok, detail)`` where ``ok`` is True when nothing failed and
        ``detail`` is a ``"; "``-joined list of the unmet expectation labels.
    """

    def _cell(row: dict[str, str], key: str) -> str:
        # csv.DictReader fills absent trailing fields with None; str(None)
        # would be the truthy text "None", letting incomplete rows pass the
        # non-empty gate. Treat a missing cell as empty instead.
        value = row.get(key)
        return "" if value is None else str(value)

    failures: list[str] = []
    for exp in expectations:
        label = exp.get("label", "")
        # Normalize needles once per expectation rather than once per row.
        item_needles = [_norm(n) for n in exp.get("item_needles") or []]
        type_needles = [_norm(t) for t in exp.get("type_needles") or []]
        matched = False
        for r in rows:
            if not _cell(r, "impact").strip() or not _cell(r, "recommended_fix").strip():
                continue
            item_n = _norm(_cell(r, "item"))
            typ_n = _norm(_cell(r, "type"))
            if item_needles and not all(n in item_n for n in item_needles):
                continue
            if type_needles and not any(t in typ_n for t in type_needles):
                continue
            matched = True
            break
        if not matched:
            failures.append(label or "expectation")
    return not failures, "; ".join(failures)


def score_workspace(workspace: Path, *, ground_truth_path: Path | None = None) -> dict[str, Any]:
    """Run the deterministic completion checks for the reproducibility audit task.

    The ground-truth JSON supplies the output paths (``report_path``,
    ``missing_path``), the expected ``claim_ids``, ``required_terms``,
    optional ``forbidden_success_phrases``, the exact ``missing_steps_header``
    and optional ``missing_expectations``.

    Args:
        workspace: Root directory containing the agent's outputs.
        ground_truth_path: Override for the ground-truth file; defaults to
            ``TASK_DIR / "ground_truth.json"``.

    Returns:
        A dict with ``task``, ``workspace``, ``outcome_score`` (weighted share
        of passing checks, rounded to 4 decimals) and the list of ``checks``.

    Raises:
        OSError / json.JSONDecodeError / KeyError: if the ground-truth file is
            missing, malformed, or lacks a required key.
    """
    w = workspace.resolve()
    gt = json.loads((ground_truth_path or TASK_DIR / "ground_truth.json").read_text(encoding="utf-8"))
    report_path = w / gt["report_path"]
    missing_path = w / gt["missing_path"]

    # Use is_file() throughout: a directory at either path must count as
    # "missing output", not crash the grader when we try to read it.
    report_is_file = report_path.is_file()
    missing_is_file = missing_path.is_file()

    checks: list[dict[str, Any]] = []
    checks.append(_check("report_exists", "reproducibility_report.md exists", report_is_file, 0.04))
    checks.append(_check("missing_exists", "missing_steps.csv exists", missing_is_file, 0.04))

    text = report_path.read_text(encoding="utf-8", errors="replace") if report_is_file else ""
    text_l = text.lower()

    sections = _claim_sections(text) if text else {}
    want_claims = set(gt["claim_ids"])
    sections_ok = report_is_file and set(sections.keys()) == want_claims
    checks.append(_check("claim_sections", "report has ### Claim sections for each CLAIM id", sections_ok, 0.10))

    terms_ok = all(term.lower() in text_l for term in gt["required_terms"])
    checks.append(_check("required_terms", "report cites artifacts metrics and failures", terms_ok, 0.10))

    forbid = list(gt.get("forbidden_success_phrases") or [])
    no_false = report_is_file and not any(p.lower() in text_l for p in forbid)
    checks.append(_check("no_false_repro", "does not assert successful reproduction", no_false, 0.10))

    # The report must both name the driver script and describe it as defective.
    defect_words = ["syntax", "parse", "indent", "broken", "invalid", "corrupt", "error"]
    script_audit = report_is_file and "analyze_main" in text_l and any(k in text_l for k in defect_words)
    checks.append(_check("script_issue_called_out", "report flags analyze_main packaging defect", script_audit, 0.08))

    rows: list[dict[str, str]] = []
    header_line_ok = missing_is_file and _first_line(missing_path) == gt["missing_steps_header"]
    cols_ok = False
    if missing_is_file:
        # errors="replace" matches _first_line: agent-produced bytes that are
        # not valid UTF-8 should fail checks, not raise UnicodeDecodeError.
        with missing_path.open("r", encoding="utf-8-sig", errors="replace", newline="") as f:
            try:
                rows = list(csv.DictReader(f))
            except csv.Error:
                # Unparseable CSV is scored as "no rows" rather than aborting.
                rows = []
        cols_ok = bool(rows) and set(rows[0].keys()) == {"item", "type", "impact", "recommended_fix"}

    checks.append(_check("missing_csv_header_line", "missing_steps header row exact", header_line_ok, 0.04))
    checks.append(_check("missing_csv_columns", "missing_steps DictReader columns", cols_ok, 0.04))

    exp_list = list(gt.get("missing_expectations") or [])
    min_rows_ok = len(rows) >= len(exp_list)
    checks.append(_check("missing_minimum_rows", "missing_steps lists each blocking gap", min_rows_ok, 0.06))

    # _check already stores miss_detail on failure, so no post-hoc patching
    # of the check record is needed.
    miss_ok, miss_detail = _expect_rows(rows, exp_list)
    checks.append(
        _check(
            "missing_expectation_coverage",
            "CSV rows cover raw/script/mismatch/subgroup gaps",
            miss_ok,
            0.40,
            miss_detail,
        )
    )

    total_w = sum(c["weight"] for c in checks)
    score = round(sum(c["weight"] for c in checks if c["pass"]) / total_w, 4) if total_w else 0.0

    return {"task": "073-research-repro-package", "workspace": str(w), "outcome_score": score, "checks": checks}

← Task 5 Task 7 →