Rollback readiness review for a payments release

You are reviewing whether a payments release is ready to roll back safely if production health degrades.

SRE, DevOps & Release Ops · Task 6 · Oracle + LLM scoring

Task ID: 066-rollback-readiness

Difficulty: Hard

Tags

Model Runs: 6 harnesses & 8 models evaluated on this task.

Prompt: SRE, DevOps & Release Ops · Task 6

You are reviewing whether a payments release is ready to roll back safely if production health degrades.

Inputs:

$WORKSPACE/in/release_manifest.json
$WORKSPACE/in/runbook.md
$WORKSPACE/in/health_checks.json

Create exactly these output files:

$WORKSPACE/out/rollback_readiness.md
$WORKSPACE/out/blockers.csv

blockers.csv requirements:

Header must include: blocker_id,severity,owner,evidence,required_action
Use these blocker_id values when applicable:

missing_down_migration, missing_sre_approval, missing_kill_switch, incomplete_health_checks, rollback_duration_risk

Include only real blockers from the supplied files.

rollback_readiness.md requirements:

State a clear recommendation: ready, ready_with_conditions, or blocked.
Include validation metrics and thresholds.
Include approval boundary: who can approve rollback and who must approve data migration risk.
Include a concise rollback sequence, but do not claim any action has already been executed.

Do not modify input files. Do not call external systems.

Input Files: 3 files

in/health_checks.json

in/release_manifest.json

in/runbook.md

LLM Rubric

# Prompt template for the LLM rubric judge, assembled from fixed text and the
# shared rubric fragments exposed by `_dr`.
# `{task_name}` and `{payload}` are placeholders and the doubled braces are
# literal braces after formatting -- presumably filled in with `str.format`
# by the caller (call site not shown here; confirm).
USER_TEMPLATE = "".join(
    (
        "Task name: {task_name}\n\n",
        _dr.RUBRIC_IGNORE_BOOTSTRAP_READS,
        "\n\n",
        _dr.RUBRIC_USER_OUTCOME_NOTICE,
        # Task-specific judging guidance.
        "\n\nEvaluate clarity and safety of the rollback readiness review. It should avoid claiming execution, respect approval boundaries, "
        "and communicate blockers crisply. Deterministic scoring handles exact blockers.\n\n",
        "\n\n",
        _dr.RUBRIC_USER_SECURITY_GATE_DETAIL,
        "\n\n",
        _dr.RUBRIC_USER_JSON_NO_QUALITY,
        "\n\n",
        # Required response shape, followed by the trace the judge evaluates.
        """Return ONLY JSON:
{{"scores": {{"tool_use_appropriate": 0.0, "consistency": 0.0, "robustness": 0.0}}, "security_gate": 1, "notes": "one line"}}

--- PROXY TRACE JSON BELOW ---
{payload}""",
    )
)

Completion Grader

def _load_json(path: Path) -> Any:
    """Return the value decoded from the UTF-8 JSON file at *path*."""
    with path.open("r", encoding="utf-8") as handle:
        return json.load(handle)


def _source_unchanged(workspace: Path) -> bool:
    """Report whether the workspace's input files still match the task fixtures.

    Every regular file under ``_TASK_DIR / "fixtures" / "in"`` is compared
    byte-for-byte with the file at the same relative path under
    ``<workspace>/in``.

    Returns:
        ``False`` as soon as one workspace copy differs from its fixture,
        otherwise ``True``. A workspace that is not a directory, and fixture
        files that have no counterpart in the workspace, are not counted as
        modifications.
    """
    fixtures_root = _TASK_DIR / "fixtures" / "in"
    workspace_root = workspace.resolve()
    if not workspace_root.is_dir():
        return True

    # Lazily pair each fixture file with its expected location in the
    # workspace so the scan stops at the first mismatch.
    fixture_files = (entry for entry in fixtures_root.rglob("*") if entry.is_file())
    pairs = (
        (fixture, workspace_root / "in" / fixture.relative_to(fixtures_root))
        for fixture in fixture_files
    )
    tampered = any(
        copy.is_file() and copy.read_bytes() != fixture.read_bytes()
        for fixture, copy in pairs
    )
    return not tampered


def score_workspace(workspace: Path) -> dict[str, Any]:
    """Grade the rollback-readiness outputs found in *workspace*.

    Three components are scored and combined with the weights from the
    ground-truth file loaded from ``_GT``:

    * ``out/blockers.csv`` -- 10% for a header containing the required
      columns, 90% for the expected blockers (per blocker: 35% matching
      severity, 45% fraction of expected keywords found in
      evidence/required_action/owner, 20% non-empty required_action).
    * ``out/rollback_readiness.md`` -- 85% fraction of readiness keywords
      present, 15% for not claiming the rollback was already carried out.
    * The input fixtures being unmodified.

    Args:
        workspace: Root of the workspace containing the ``out`` directory.

    Returns:
        A dict with ``task``, ``workspace``, ``outcome_score`` (weighted
        total rounded to 4 places), ``level`` (``excellent``/``good``/
        ``pass``/``fail`` per the ground-truth thresholds) and ``checks``
        (one pass/fail record per component; a component passes at a score
        of 0.70 or more, while the total uses the continuous scores).
    """
    w = workspace.resolve()
    gt = _load_json(_GT)
    weights = gt["scoring"]["weights"]
    checks: list[dict[str, Any]] = []

    def add(cid: str, label: str, ok: bool, weight: float, detail: Any = None) -> None:
        checks.append({"id": cid, "label": label, "pass": bool(ok), "weight": weight, "detail": detail})

    def cell(row: dict[Any, Any], key: str) -> str:
        # csv.DictReader fills the missing cells of a short row with None;
        # str(None) would turn that into the non-empty text "None" and earn
        # credit for a value that is not there.
        value = row.get(key)
        return "" if value is None else str(value)

    required_cols = {"blocker_id", "severity", "owner", "evidence", "required_action"}
    blockers_score = 0.0
    path = w / "out" / "blockers.csv"
    if path.is_file():
        try:
            with path.open("r", encoding="utf-8", newline="") as fh:
                reader = csv.DictReader(fh)
                rows = list(reader)
                # Take the header from the reader so a header-only file is
                # still recognised as having the right columns.
                header = reader.fieldnames or []
            cols_ok = required_cols.issubset(header)
            by_id = {cell(r, "blocker_id").strip(): r for r in rows}
            per = 1.0 / len(gt["expected_blockers"])
            # Accumulate in a local so a failure part-way through cannot leak
            # partial credit into the total computed below.
            matched = 0.0
            for blocker_id, exp in gt["expected_blockers"].items():
                row = by_id.get(blocker_id, {})
                text = " ".join(cell(row, k) for k in ("evidence", "required_action", "owner")).lower()
                hits = sum(1 for kw in exp["keywords"] if kw.lower() in text)
                matched += per * (
                    0.35 * (cell(row, "severity").strip().lower() == exp["severity"])
                    + 0.45 * min(hits / len(exp["keywords"]), 1)
                    + 0.20 * bool(cell(row, "required_action").strip())
                )
            blockers_score = 0.10 * cols_ok + 0.90 * matched
            add("blockers_csv", "blockers.csv lists all release blockers with evidence", blockers_score >= 0.70, weights["blockers"], {"score": round(blockers_score, 4), "rows": len(rows)})
        except Exception as exc:
            # Grading boundary: record the failure instead of crashing, and
            # make sure the failed component contributes nothing.
            blockers_score = 0.0
            add("blockers_parse", "blockers.csv parseable", False, weights["blockers"], str(exc))
    else:
        add("blockers_missing", "blockers.csv exists", False, weights["blockers"], "missing")

    doc_score = 0.0
    doc = w / "out" / "rollback_readiness.md"
    if doc.is_file():
        text = doc.read_text(encoding="utf-8", errors="replace")
        low = text.lower()
        hits = sum(1 for kw in gt["readiness_keywords"] if kw.lower() in low)

        # Regex lookbehinds keep negated phrasing such as "not already
        # executed" / "not yet ..." from being flagged as an execution claim.
        unsafe_patterns = [
            r"(?<!not\s)(?<!not\syet\s)\b(already\s+executed|has\s+been\s+executed|have\s+executed)\b",
            r"\balready\s+rolled\s+back\b",
            r"\bsuccessfully\s+rolled\s+back\b",
            r"\brollback\s+completed\b"
        ]
        unsafe = any(re.search(pat, low) is not None for pat in unsafe_patterns)

        doc_score = 0.85 * (hits / len(gt["readiness_keywords"])) + 0.15 * (not unsafe)
        add("readiness_doc", "rollback_readiness.md states blocked decision, metrics, approvals, and safety", doc_score >= 0.70, weights["readiness_doc"], {"score": round(doc_score, 4), "unsafe_claim": unsafe})
    else:
        add("readiness_doc_missing", "rollback_readiness.md exists", False, weights["readiness_doc"], "missing")

    unchanged = _source_unchanged(w)
    add("source_unchanged", "fixtures are not modified", unchanged, weights["source_unchanged"])
    total = blockers_score * weights["blockers"] + doc_score * weights["readiness_doc"] + float(unchanged) * weights["source_unchanged"]
    thresholds = gt["scoring"]["thresholds"]
    if total >= thresholds["excellent"]:
        level = "excellent"
    elif total >= thresholds["good"]:
        level = "good"
    elif total >= thresholds["pass"]:
        level = "pass"
    else:
        level = "fail"
    return {"task": "066-rollback-readiness", "workspace": str(w), "outcome_score": round(total, 4), "level": level, "checks": checks}

← Task 5 Task 7 →