Model Runs: 6 harnesses & 8 models evaluated on this task.
Loading...
Prompt: SRE, DevOps & Release Ops · Task 4
You are triaging a production checkout incident from offline evidence.
Inputs:
- $WORKSPACE/in/topology.json
- $WORKSPACE/in/metrics/service_metrics.csv
- $WORKSPACE/in/logs/checkout-web.log
- $WORKSPACE/in/logs/payment-api.log
- $WORKSPACE/in/logs/auth-gateway.log
- $WORKSPACE/in/logs/orders-db.log
Create exactly these output files:
- $WORKSPACE/out/root_cause.json
- $WORKSPACE/out/triage_notes.md
root_cause.json requirements:
- Top-level object with: incident_id, root_cause_service, root_cause_change_id, confidence, affected_services, evidence, excluded_red_herrings, recommended_next_steps.
- evidence must be a list of objects with source_file and observation.
- excluded_red_herrings must name misleading signals and why they are not the primary cause.
triage_notes.md requirements:
- Summarize the dependency path from user impact to root cause.
- Cite at least three fixture files by path.
- Include immediate mitigation and verification steps.
- Explicitly mark inferences vs directly observed facts.
Do not modify input files. Do not use live services or external APIs.
Input Files: 6 files
in/logs/auth-gateway.log
in/logs/checkout-web.log
in/logs/orders-db.log
in/logs/payment-api.log
in/metrics/service_metrics.csv
in/topology.json
LLM Rubric
USER_TEMPLATE = (
"Task name: {task_name}\n\n"
+ _dr.RUBRIC_IGNORE_BOOTSTRAP_READS
+ "\n\n"
+ _dr.RUBRIC_USER_OUTCOME_NOTICE
+ "\n\nEvaluate process quality only: evidence discipline, separation of fact/inference, and safe mitigation language. "
"Do not override deterministic checks for exact root cause fields.\n\n"
+ "\n\n"
+ _dr.RUBRIC_USER_SECURITY_GATE_DETAIL
+ "\n\n"
+ _dr.RUBRIC_USER_JSON_NO_QUALITY
+ "\n\n"
+ """Return ONLY JSON:
{{"scores": {{"tool_use_appropriate": 0.0, "consistency": 0.0, "robustness": 0.0}}, "security_gate": 1, "notes": "one line"}}
--- PROXY TRACE JSON BELOW ---
{payload}"""
)
Completion Grader
def _load_json(path: Path) -> Any:
return json.loads(path.read_text(encoding="utf-8"))
def _norm(value: Any) -> str:
return str(value or "").strip().lower()
def _contains_all(text: str, tokens: list[str]) -> int:
low = text.lower()
return sum(1 for token in tokens if token.lower() in low)
def _source_unchanged(workspace: Path) -> bool:
    """Report whether the input files staged in *workspace* still match the fixtures.

    Each file under ``_TASK_DIR / "fixtures" / "in"`` is compared byte-for-byte
    with the file at the same relative path under ``<workspace>/in``.

    Returns ``False`` as soon as one staged file differs from its fixture.
    A workspace that is not a directory, or a staged file that is absent,
    is treated as unchanged.
    """
    fixtures_root = _TASK_DIR / "fixtures" / "in"
    resolved = workspace.resolve()
    if not resolved.is_dir():
        # Nothing staged to compare against.
        return True

    staged_root = resolved / "in"
    for fixture in fixtures_root.rglob("*"):
        if not fixture.is_file():
            continue
        staged = staged_root / fixture.relative_to(fixtures_root)
        if not staged.is_file():
            # Missing copies are not counted as modifications.
            continue
        if staged.read_bytes() != fixture.read_bytes():
            return False
    return True
def score_workspace(workspace: Path) -> dict[str, Any]:
    """Grade the triage outputs found in *workspace* against the ground truth.

    Three weighted checks are recorded:

    * ``out/root_cause.json`` - identifiers, affected services, evidence
      sources and excluded red herrings.
    * ``out/triage_notes.md`` - expected keywords, cited fixture paths and
      fact/inference wording.
    * the staged input files being unmodified.

    Returns a dict with ``task``, ``workspace``, ``outcome_score`` (rounded to
    four decimals), ``level`` (``excellent``/``good``/``pass``/``fail``) and
    the list of individual ``checks``.
    """
    w = workspace.resolve()
    gt = _load_json(_GT)
    exp = gt["expected"]
    weights = gt["scoring"]["weights"]
    checks: list[dict[str, Any]] = []

    def add(cid: str, label: str, ok: bool, weight: float, detail: Any = None) -> None:
        """Append one check record to ``checks``."""
        checks.append({"id": cid, "label": label, "pass": bool(ok), "weight": weight, "detail": detail})

    # --- root_cause.json -------------------------------------------------
    json_score = 0.0
    path = w / "out" / "root_cause.json"
    if path.is_file():
        try:
            data = _load_json(path)
            if not isinstance(data, dict):
                # Fail with an explicit message rather than an incidental
                # AttributeError from calling .get() on a list or scalar.
                raise TypeError("root_cause.json top-level value must be a JSON object")

            affected_values = data.get("affected_services", [])
            affected = {str(x).lower() for x in affected_values} if isinstance(affected_values, list) else set()
            # Evidence and red herrings are matched as substrings of their
            # serialized JSON, so any nesting or key naming is accepted.
            evidence_text = json.dumps(data.get("evidence", ""), ensure_ascii=False).lower()
            red_text = json.dumps(data.get("excluded_red_herrings", ""), ensure_ascii=False).lower()
            source_hits = _contains_all(evidence_text, exp["evidence_sources"])
            affected_hits = len(affected & {x.lower() for x in exp["affected_services"]})
            red_hits = _contains_all(red_text, exp["red_herrings"])
            # NOTE: any numeric confidence is accepted; bool is a subclass of
            # int, so JSON true/false also passes this check.
            confidence_ok = str(data.get("confidence", "")).lower() in {"high", "medium", "0.8", "0.9", "0.95"} or isinstance(data.get("confidence"), (int, float))
            json_score = (
                0.22 * (_norm(data.get("incident_id")) == gt["incident_id"].lower())
                # The expected service name is lower-cased like the other two
                # identifiers; comparing against the raw value made a
                # mixed-case ground truth impossible to match.
                + 0.24 * (_norm(data.get("root_cause_service")) == exp["root_cause_service"].lower())
                + 0.18 * (_norm(data.get("root_cause_change_id")) == exp["root_cause_change_id"].lower())
                + 0.14 * (affected_hits / len(exp["affected_services"]))
                + 0.12 * min(source_hits / 4, 1)
                + 0.07 * min(red_hits / len(exp["red_herrings"]), 1)
                + 0.03 * confidence_ok
            )
            add(
                "root_cause_json",
                "root_cause.json identifies root cause, evidence, and red herrings",
                json_score >= 0.70,
                weights["root_cause_json"],
                {"score": round(json_score, 4), "source_hits": source_hits, "red_hits": red_hits},
            )
        except Exception as exc:
            # Any failure while reading or scoring the submission is recorded
            # as a failed check instead of aborting the whole grading run.
            add("root_cause_parse", "root_cause.json parseable", False, weights["root_cause_json"], str(exc))
    else:
        add("root_cause_missing", "root_cause.json exists", False, weights["root_cause_json"], "missing")

    # --- triage_notes.md -------------------------------------------------
    notes_score = 0.0
    notes = w / "out" / "triage_notes.md"
    if notes.is_file():
        text = notes.read_text(encoding="utf-8", errors="replace")
        keyword_hits = _contains_all(text, gt["notes_keywords"])
        file_hits = _contains_all(text, exp["evidence_sources"])
        fact_inference_ok = bool(re.search(r"fact|observed|inference|inferred", text, re.IGNORECASE))
        notes_score = (
            0.45 * (keyword_hits / len(gt["notes_keywords"]))
            + 0.40 * min(file_hits / 3, 1)
            + 0.15 * fact_inference_ok
        )
        add(
            "triage_notes",
            "triage_notes.md cites sources and separates facts from inference",
            notes_score >= 0.70,
            weights["triage_notes"],
            {"score": round(notes_score, 4)},
        )
    else:
        add("triage_notes_missing", "triage_notes.md exists", False, weights["triage_notes"], "missing")

    # --- input fixtures --------------------------------------------------
    unchanged = _source_unchanged(w)
    add("source_unchanged", "fixtures are not modified", unchanged, weights["source_unchanged"])

    # The total uses the continuous sub-scores, not the pass/fail flags above.
    total = (
        json_score * weights["root_cause_json"]
        + notes_score * weights["triage_notes"]
        + float(unchanged) * weights["source_unchanged"]
    )
    thresholds = gt["scoring"]["thresholds"]
    if total >= thresholds["excellent"]:
        level = "excellent"
    elif total >= thresholds["good"]:
        level = "good"
    elif total >= thresholds["pass"]:
        level = "pass"
    else:
        level = "fail"
    return {"task": "064-service-dependency-triage", "workspace": str(w), "outcome_score": round(total, 4), "level": level, "checks": checks}