Education assignment grading and feedback

You are grading short student responses with a fixed rubric.

Vertical Professional Workflows · Task 7 · Oracle + LLM scoring
Model Runs: 6 harnesses & 8 models evaluated on this task.
Loading...
Prompt: Vertical Professional Workflows · Task 7

You are grading short student responses with a fixed rubric.

Read:

  • $WORKSPACE/in/rubric.md
  • $WORKSPACE/in/submissions/*.txt

Create:

  • $WORKSPACE/out/grades.csv
  • $WORKSPACE/out/feedback/*.md

Requirements for $WORKSPACE/out/grades.csv:

  • Header must be: submission_id,concept_score,evidence_score,clarity_score,total_score
  • Include one row for each submission.
  • Scores must follow rubric.md. concept_score max 4, evidence_score max 3, clarity_score max 3, total max 10.

Requirements for $WORKSPACE/out/feedback/*.md:

  • Create one feedback markdown file per submission id, e.g. $WORKSPACE/out/feedback/submission_1.md.
  • Each file must mention at least one strength and one improvement.
  • Feedback must be specific to that submission.

Do not use the network. Do not modify input files.

Input Files: 4 files
in/rubric.md
in/submissions/submission_1.txt
in/submissions/submission_2.txt
in/submissions/submission_3.txt
LLM Rubric
# User-message template for the LLM judge of the grading-and-feedback task.
# The sections below are joined with a blank line between each of them: a task
# header, shared rubric notices taken from `_dr`, the task-specific qualitative
# criteria, and the required JSON reply format followed by the trace payload.
# NOTE(review): `{task_name}` / `{payload}` and the doubled braces around the
# JSON example look like str.format placeholders/escapes — confirm at the call
# site that this template is rendered with str.format.
USER_TEMPLATE = "\n\n".join(
    (
        "Task name: {task_name}",
        _dr.RUBRIC_IGNORE_BOOTSTRAP_READS,
        _dr.RUBRIC_USER_OUTCOME_NOTICE,
        # Task-specific criteria; the trailing blank lines inside the literal
        # are part of the prompt text and are kept as-is.
        """Evaluate the agent run for grading and feedback. Deterministic checks cover exact scores, totals, and feedback file coverage. Score only qualitative aspects:
- tool_use_appropriate: reads rubric and all submissions.
- consistency: applies rubric before writing grades and feedback.
- robustness: handles partial answers without inventing student intent.

""",
        _dr.RUBRIC_USER_SECURITY_GATE_DETAIL,
        _dr.RUBRIC_USER_JSON_NO_QUALITY,
        # Reply contract and the slot where the trace JSON is appended.
        """Return ONLY JSON:
{{"scores": {{"tool_use_appropriate": 0.0, "consistency": 0.0, "robustness": 0.0}}, "security_gate": 1, "notes": "one line"}}

Optional **total**: harness recomputes mean of three process scores.

--- PROXY TRACE JSON BELOW ---
{payload}
""",
    )
)
Completion Grader
def _check(cid: str, label: str, ok: bool, weight: float, detail: str = "") -> dict[str, Any]:
    """Build the report record for a single weighted pass/fail check.

    Args:
        cid: Identifier stored under ``"id"``.
        label: Human-readable description stored under ``"label"``.
        ok: Outcome of the check; coerced to ``bool`` for the ``"pass"`` field.
        weight: Weight of the check, stored unchanged under ``"weight"``.
        detail: Explanation reported only when the check did not pass.

    Returns:
        A dict with the keys ``id``, ``label``, ``pass``, ``weight`` and
        ``detail``; ``detail`` is ``None`` for a passing check.
    """
    passed = bool(ok)
    record: dict[str, Any] = {"id": cid, "label": label, "pass": passed, "weight": weight}
    # A passing check carries no explanation, whatever the caller supplied.
    record["detail"] = detail if not passed else None
    return record


def score_workspace(workspace: Path, *, ground_truth_path: Path | None = None) -> dict[str, Any]:
    """Score an agent's grading workspace against the task ground truth.

    Six weighted pass/fail checks are evaluated: the grades CSV exists (0.10),
    it has exactly the expected submission ids (0.16), every expected field
    matches the ground truth as a string (0.34), each row's total equals the
    sum of its three component scores (0.14), a feedback file exists per
    submission (0.12), and each feedback file mentions its submission id and
    every required term, compared case-insensitively (0.14).

    Args:
        workspace: Root directory holding the agent's outputs.
        ground_truth_path: JSON file providing ``grades_path``,
            ``feedback_dir``, ``expected_scores`` (submission id -> field ->
            value) and ``feedback_required_terms``. Defaults to
            ``TASK_DIR / "ground_truth.json"``.

    Returns:
        A dict with ``task``, ``workspace``, ``outcome_score`` (passed weight
        divided by total weight, rounded to 4 places) and the list of
        ``checks`` records.

    Raises:
        OSError: If the ground-truth file cannot be read.
        json.JSONDecodeError: If the ground-truth file is not valid JSON.
        KeyError: If the ground truth lacks a key that is needed.
    """
    w = workspace.resolve()
    gt = json.loads((ground_truth_path or TASK_DIR / "ground_truth.json").read_text(encoding="utf-8"))
    grades = w / gt["grades_path"]
    feedback_dir = w / gt["feedback_dir"]
    expected = gt["expected_scores"]

    checks: list[dict[str, Any]] = []
    checks.append(_check("grades_exists", "grades.csv exists", grades.is_file(), 0.10))

    rows: list[dict[str, str]] = []
    # Use is_file() (same predicate as the check above) so a directory at the
    # grades path fails the checks instead of crashing the grader on open().
    if grades.is_file():
        try:
            # errors="replace" mirrors the feedback read below: undecodable
            # bytes in agent output should fail checks, not raise.
            with grades.open("r", encoding="utf-8", errors="replace", newline="") as f:
                rows = list(csv.DictReader(f))
        except csv.Error:
            # Malformed CSV is treated as "no usable rows".
            rows = []
    by_id = {r.get("submission_id", ""): r for r in rows}
    checks.append(_check("all_submissions", "grades include every submission", set(by_id) == set(expected), 0.16))

    exact_scores = True
    totals_ok = True
    for sid, exp in expected.items():
        row = by_id.get(sid, {})
        for field, val in exp.items():
            # Strict string comparison against the raw CSV cell.
            exact_scores = exact_scores and row.get(field, "") == str(val)
        try:
            component_sum = int(row["concept_score"]) + int(row["evidence_score"]) + int(row["clarity_score"])
            totals_ok = totals_ok and component_sum == int(row["total_score"])
        except (KeyError, TypeError, ValueError):
            # Missing column/row, short row (None cell) or non-integer text.
            totals_ok = False
    checks.append(_check("exact_scores", "scores match rubric expectations", exact_scores, 0.34))
    checks.append(_check("total_math", "total_score equals component sum", totals_ok, 0.14))

    files_ok = all((feedback_dir / f"{sid}.md").is_file() for sid in expected)
    checks.append(_check("feedback_files", "one feedback file per submission", files_ok, 0.12))

    feedback_ok = True
    for sid in expected:
        p = feedback_dir / f"{sid}.md"
        # is_file() keeps a directory named like a feedback file from raising.
        t = p.read_text(encoding="utf-8", errors="replace").lower() if p.is_file() else ""
        # The text is lowercased, so the id and required terms must be too;
        # otherwise any uppercase character could never match.
        feedback_ok = (
            feedback_ok
            and sid.lower() in t
            and all(term.lower() in t for term in gt["feedback_required_terms"])
        )
    checks.append(_check("feedback_content", "feedback mentions submission id, strength, and improvement", feedback_ok, 0.14))

    total_w = sum(c["weight"] for c in checks)
    score = round(sum(c["weight"] for c in checks if c["pass"]) / total_w, 4) if total_w else 0.0
    return {"task": "074-education-grading-feedback", "workspace": str(w), "outcome_score": score, "checks": checks}