Service dependency triage with logs, topology, and metrics

You are triaging a production checkout incident from offline evidence.

SRE, DevOps & Release Ops · Task 4 · Oracle + LLM scoring
Model Runs: 6 harnesses & 8 models evaluated on this task.
Loading...
Prompt: SRE, DevOps & Release Ops · Task 4

You are triaging a production checkout incident from offline evidence.

Inputs:

  • $WORKSPACE/in/topology.json
  • $WORKSPACE/in/metrics/service_metrics.csv
  • $WORKSPACE/in/logs/checkout-web.log
  • $WORKSPACE/in/logs/payment-api.log
  • $WORKSPACE/in/logs/auth-gateway.log
  • $WORKSPACE/in/logs/orders-db.log

Create exactly these output files:

  • $WORKSPACE/out/root_cause.json
  • $WORKSPACE/out/triage_notes.md

root_cause.json requirements:

  • Top-level object with: incident_id, root_cause_service, root_cause_change_id, confidence, affected_services, evidence, excluded_red_herrings, recommended_next_steps.
  • evidence must be a list of objects with source_file and observation.
  • excluded_red_herrings must name misleading signals and why they are not the primary cause.

triage_notes.md requirements:

  • Summarize the dependency path from user impact to root cause.
  • Cite at least three fixture files by path.
  • Include immediate mitigation and verification steps.
  • Explicitly mark inferences vs directly observed facts.

Do not modify input files. Do not use live services or external APIs.

Input Files: 6 files
in/logs/auth-gateway.log
in/logs/checkout-web.log
in/logs/orders-db.log
in/logs/payment-api.log
in/metrics/service_metrics.csv
in/topology.json
LLM Rubric
# User-message template for the LLM rubric judge. The ``{task_name}`` and
# ``{payload}`` fields are placeholders; the doubled braces around the JSON
# example are escapes, so the template is presumably rendered with
# ``str.format`` (the call site is not visible here -- confirm before changing
# the brace style).
USER_TEMPLATE = "".join(
    (
        "Task name: {task_name}\n\n",
        _dr.RUBRIC_IGNORE_BOOTSTRAP_READS,
        "\n\n",
        _dr.RUBRIC_USER_OUTCOME_NOTICE,
        # Task-specific guidance: the judge rates process quality only and
        # leaves the exact root-cause fields to the deterministic grader.
        "\n\nEvaluate process quality only: evidence discipline, separation of fact/inference, and safe mitigation language. "
        "Do not override deterministic checks for exact root cause fields.\n\n",
        "\n\n",
        _dr.RUBRIC_USER_SECURITY_GATE_DETAIL,
        "\n\n",
        _dr.RUBRIC_USER_JSON_NO_QUALITY,
        "\n\n",
        # Required response shape, followed by the trace the judge evaluates.
        """Return ONLY JSON:
{{"scores": {{"tool_use_appropriate": 0.0, "consistency": 0.0, "robustness": 0.0}}, "security_gate": 1, "notes": "one line"}}

--- PROXY TRACE JSON BELOW ---
{payload}""",
    )
)
Completion Grader
def _load_json(path: Path) -> Any:
    """Parse and return the UTF-8 encoded JSON document stored at ``path``."""
    with path.open(encoding="utf-8") as handle:
        return json.load(handle)


def _norm(value: Any) -> str:
    """Return ``value`` as a trimmed, lower-cased string for loose comparison.

    Any falsy value (``None``, ``""``, ``0``, ``False``, empty containers)
    normalises to the empty string.
    """
    text = str(value) if value else ""
    return text.strip().lower()


def _contains_all(text: str, tokens: list[str]) -> int:
    """Count how many entries of ``tokens`` occur in ``text``, ignoring case.

    Despite the name, the result is a hit count rather than a boolean. Each
    list entry is tested independently as a substring, so duplicate tokens
    are counted once per occurrence in the list.
    """
    haystack = text.lower()
    found = 0
    for needle in tokens:
        if needle.lower() in haystack:
            found += 1
    return found


def _source_unchanged(workspace: Path) -> bool:
    """Report whether the workspace's input files still match the fixtures.

    Every file under ``_TASK_DIR / "fixtures" / "in"`` is compared byte for
    byte with the file at the same relative path under ``<workspace>/in``.

    Returns:
        ``False`` as soon as one workspace file differs from its fixture;
        ``True`` otherwise. A workspace that is not a directory, and fixture
        files that have no counterpart in the workspace, are not treated as
        modifications.
    """
    fixtures_in = _TASK_DIR / "fixtures" / "in"
    ws_root = workspace.resolve()
    if not ws_root.is_dir():
        return True

    pristine_files = (entry for entry in fixtures_in.rglob("*") if entry.is_file())
    for pristine in pristine_files:
        copy = ws_root / "in" / pristine.relative_to(fixtures_in)
        if not copy.is_file():
            # Nothing to compare against; only differing content counts.
            continue
        if copy.read_bytes() != pristine.read_bytes():
            return False
    return True


def score_workspace(workspace: Path) -> dict[str, Any]:
    """Grade one triage run's output files against the ground-truth file.

    Three components are scored and combined using the weights stored in the
    ground truth (``gt["scoring"]["weights"]``):

    * ``out/root_cause.json`` -- exact-match fields plus partial credit for
      affected services, cited evidence sources and named red herrings.
    * ``out/triage_notes.md`` -- keyword coverage, cited fixture paths and the
      presence of fact/inference wording.
    * the input fixtures being byte-identical to the originals.

    Args:
        workspace: Directory that should contain the ``out/`` files (and the
            ``in/`` copies checked by ``_source_unchanged``).

    Returns:
        A dict with ``task``, ``workspace``, ``outcome_score`` (rounded to four
        decimals), ``level`` (``"excellent"``, ``"good"``, ``"pass"`` or
        ``"fail"``) and ``checks`` (one entry per component with ``id``,
        ``label``, ``pass``, ``weight`` and ``detail``).
    """
    w = workspace.resolve()
    gt = _load_json(_GT)
    exp = gt["expected"]
    weights = gt["scoring"]["weights"]
    checks: list[dict[str, Any]] = []

    def add(cid: str, label: str, ok: bool, weight: float, detail: Any = None) -> None:
        """Record the outcome of one check."""
        checks.append({"id": cid, "label": label, "pass": bool(ok), "weight": weight, "detail": detail})

    def share(hits: int, expected_count: int) -> float:
        """Return hits / expected_count, or 0.0 when nothing is expected.

        Guards against an empty ground-truth list, which would otherwise
        raise ZeroDivisionError in the middle of grading.
        """
        return hits / expected_count if expected_count else 0.0

    json_score = 0.0
    path = w / "out" / "root_cause.json"
    if path.is_file():
        try:
            data = _load_json(path)
            if not isinstance(data, dict):
                # Fail with a clear message instead of an incidental
                # AttributeError on the first ``data.get`` call.
                raise TypeError("root_cause.json top-level value must be a JSON object")

            def matches(field: str, expected: Any) -> bool:
                """Compare a submitted field with the ground truth, both normalised."""
                return _norm(data.get(field)) == _norm(expected)

            affected_values = data.get("affected_services", [])
            affected = {str(x).lower() for x in affected_values} if isinstance(affected_values, list) else set()
            # Evidence and red herrings are free-form, so they are serialised
            # and searched as text for the expected tokens.
            evidence_text = json.dumps(data.get("evidence", ""), ensure_ascii=False).lower()
            red_text = json.dumps(data.get("excluded_red_herrings", ""), ensure_ascii=False).lower()
            source_hits = _contains_all(evidence_text, exp["evidence_sources"])
            affected_hits = len(affected & {x.lower() for x in exp["affected_services"]})
            red_hits = _contains_all(red_text, exp["red_herrings"])
            confidence = data.get("confidence")
            confidence_ok = str(data.get("confidence", "")).lower() in {"high", "medium", "0.8", "0.9", "0.95"} or isinstance(confidence, (int, float))
            json_score = (
                0.22 * matches("incident_id", gt["incident_id"])
                # The expected service is normalised like the other two exact
                # fields; previously it was compared without lower-casing.
                + 0.24 * matches("root_cause_service", exp["root_cause_service"])
                + 0.18 * matches("root_cause_change_id", exp["root_cause_change_id"])
                + 0.14 * share(affected_hits, len(exp["affected_services"]))
                + 0.12 * min(source_hits / 4, 1)
                + 0.07 * min(share(red_hits, len(exp["red_herrings"])), 1)
                + 0.03 * confidence_ok
            )
            add("root_cause_json", "root_cause.json identifies root cause, evidence, and red herrings", json_score >= 0.70, weights["root_cause_json"], {"score": round(json_score, 4), "source_hits": source_hits, "red_hits": red_hits})
        except Exception as exc:
            # Deliberately broad: malformed submissions must produce a failed
            # check, never crash the grader.
            add("root_cause_parse", "root_cause.json parseable", False, weights["root_cause_json"], str(exc))
    else:
        add("root_cause_missing", "root_cause.json exists", False, weights["root_cause_json"], "missing")

    notes_score = 0.0
    notes = w / "out" / "triage_notes.md"
    if notes.is_file():
        text = notes.read_text(encoding="utf-8", errors="replace")
        keyword_hits = _contains_all(text, gt["notes_keywords"])
        file_hits = _contains_all(text, exp["evidence_sources"])
        fact_inference_ok = bool(re.search(r"fact|observed|inference|inferred", text, re.IGNORECASE))
        notes_score = (
            0.45 * share(keyword_hits, len(gt["notes_keywords"]))
            + 0.40 * min(file_hits / 3, 1)
            + 0.15 * fact_inference_ok
        )
        add("triage_notes", "triage_notes.md cites sources and separates facts from inference", notes_score >= 0.70, weights["triage_notes"], {"score": round(notes_score, 4)})
    else:
        add("triage_notes_missing", "triage_notes.md exists", False, weights["triage_notes"], "missing")

    unchanged = _source_unchanged(w)
    add("source_unchanged", "fixtures are not modified", unchanged, weights["source_unchanged"])
    total = json_score * weights["root_cause_json"] + notes_score * weights["triage_notes"] + float(unchanged) * weights["source_unchanged"]

    # Take the highest level whose threshold the total reaches.
    thresholds = gt["scoring"]["thresholds"]
    level = "fail"
    for candidate in ("excellent", "good", "pass"):
        if total >= thresholds[candidate]:
            level = candidate
            break
    return {"task": "064-service-dependency-triage", "workspace": str(w), "outcome_score": round(total, 4), "level": level, "checks": checks}