Kubernetes configuration audit for a checkout deployment

You are auditing an offline Kubernetes service configuration.

SRE, DevOps & Release Ops · Task 2 · Oracle + LLM scoring
Model Runs: 6 harnesses & 8 models evaluated on this task.
Loading...
Prompt: SRE, DevOps & Release Ops · Task 2

You are auditing an offline Kubernetes service configuration.

Inputs:

  • $WORKSPACE/in/deployment.yaml
  • $WORKSPACE/in/service.yaml
  • $WORKSPACE/in/policy.md

Create exactly this output file:

  • $WORKSPACE/out/k8s_audit.csv

CSV requirements:

  • Header must include: check_id,resource,status,severity,evidence,recommendation
  • One row per relevant finding.
  • status must be one of pass, fail, warn.
  • Use the exact check_id values when a finding applies:

cpu_memory_limits, readiness_probe, liveness_probe, service_selector, service_target_port, image_tag_policy, privileged_container, service_type_policy

Audit expectations:

  • Check container CPU and memory requests/limits.
  • Check readiness and liveness probes.
  • Check Deployment labels vs Service selector.
  • Check Service port/targetPort vs declared container ports.
  • Apply every forbidden or required item in policy.md.

Do not modify input files. Do not call the network or use live cluster tools. Base your answer only on the supplied files.

Input Files (3 files)
in/deployment.yaml
in/policy.md
in/service.yaml
LLM Rubric
# Prompt template for the LLM rubric grader, assembled from fixed text and the
# shared rubric fragments exposed by `_dr`.
# `{task_name}` and `{payload}` are placeholders; the doubled braces in the JSON
# example are literal braces once formatted (presumably via str.format —
# confirm at the call site).
# The two consecutive "\n\n" pieces before the security-gate fragment are kept
# as-is, so four newlines separate it from the preceding paragraph.
USER_TEMPLATE = "".join(
    (
        "Task name: {task_name}\n\n",
        _dr.RUBRIC_IGNORE_BOOTSTRAP_READS,
        "\n\n",
        _dr.RUBRIC_USER_OUTCOME_NOTICE,
        "\n\nEvaluate whether the Kubernetes audit is evidence-based, concise, and does not invent cluster state. ",
        "The deterministic oracle owns factual scoring; use this rubric only for process, clarity, and safe recommendations.\n\n",
        "\n\n",
        _dr.RUBRIC_USER_SECURITY_GATE_DETAIL,
        "\n\n",
        _dr.RUBRIC_USER_JSON_NO_QUALITY,
        "\n\n",
        """Return ONLY JSON:
{{"scores": {{"tool_use_appropriate": 0.0, "consistency": 0.0, "robustness": 0.0}}, "security_gate": 1, "notes": "one line"}}

--- PROXY TRACE JSON BELOW ---
{payload}""",
    )
)
Completion Grader
def _load_json(path: Path) -> Any:
    """Decode the UTF-8 JSON document stored at *path* and return its value."""
    with path.open("r", encoding="utf-8") as handle:
        return json.load(handle)


def _norm(value: Any) -> str:
    """Return *value* as trimmed, lower-cased text; any falsy value becomes ""."""
    text = str(value) if value else ""
    trimmed = text.strip()
    return trimmed.lower()


def _source_unchanged(workspace: Path) -> bool:
    """Report whether the workspace copies of the input fixtures are intact.

    Every file below ``_TASK_DIR / "fixtures" / "in"`` is compared byte-for-byte
    with the file at the same relative path under ``<workspace>/in``.

    Args:
        workspace: Root directory of the run; resolved before use.

    Returns:
        False as soon as one workspace copy differs from its fixture,
        otherwise True. True is also returned when the workspace is not a
        directory.
    """
    fixtures_root = _TASK_DIR / "fixtures" / "in"
    resolved = workspace.resolve()
    if not resolved.is_dir():
        return True
    workspace_in = resolved / "in"

    def differs(fixture: Path) -> bool:
        # NOTE(review): a fixture with no regular-file counterpart in the
        # workspace (e.g. deleted) is not counted as modified — confirm intended.
        copy = workspace_in / fixture.relative_to(fixtures_root)
        return copy.is_file() and copy.read_bytes() != fixture.read_bytes()

    return not any(
        differs(entry) for entry in fixtures_root.rglob("*") if entry.is_file()
    )


def score_workspace(workspace: Path) -> dict[str, Any]:
    """Score an audit workspace against the ground-truth findings.

    Reads ``out/k8s_audit.csv`` below *workspace* and, for every expected
    finding in the ground-truth file (``_GT``), looks up the CSV row with the
    same ``check_id``.  Each expected finding earns:

    * 0.35 when the row's normalised status equals the expected status,
    * 0.25 when the row's normalised severity equals the expected severity,
    * 0.25 scaled by the fraction of expected keywords found
      (case-insensitively) in the row's evidence/recommendation/resource text,
    * 0.15 when the row has a non-empty recommendation.

    The CSV score is ``0.12 * header_ok + 0.88 * mean(row scores)`` capped at
    1.0, and the outcome combines it with the fixtures-unchanged check using
    the weights from the ground truth.

    Args:
        workspace: Root directory of the run; resolved before use.

    Returns:
        A dict with the task id, the resolved workspace path, the
        ``outcome_score`` rounded to 4 places, the resulting ``level`` and the
        list of individual check records.
    """
    w = workspace.resolve()
    gt = _load_json(_GT)
    checks: list[dict[str, Any]] = []
    weights = gt["scoring"]["weights"]

    def add(cid: str, label: str, ok: bool, weight: float, detail: Any = None) -> None:
        checks.append({"id": cid, "label": label, "pass": bool(ok), "weight": weight, "detail": detail})

    def cell(row: dict[Any, Any], key: str) -> str:
        # csv.DictReader fills fields missing from a short row with None;
        # str(None) would yield "None" and be counted as real content.
        return str(row.get(key) or "")

    csv_score = 0.0
    path = w / "out" / "k8s_audit.csv"
    if path.is_file():
        try:
            # utf-8-sig drops a leading BOM (which would otherwise corrupt the
            # first header name) and decodes BOM-less files identically.
            with path.open("r", encoding="utf-8-sig", newline="") as fh:
                reader = csv.DictReader(fh)
                # Take the header from the reader so a header-only file is
                # still judged on its actual columns.
                header = set(reader.fieldnames or ())
                rows = list(reader)
            columns_ok = set(gt["required_columns"]).issubset(header)
            # Later rows overwrite earlier ones that share a check_id.
            by_id = {cell(row, "check_id").strip(): row for row in rows}
            expected = gt["expected_findings"]
            per = 1.0 / len(expected)
            findings_score = 0.0
            for check_id, exp in expected.items():
                row = by_id.get(check_id, {})
                haystack = " ".join(cell(row, k) for k in ("evidence", "recommendation", "resource")).lower()
                keywords = exp["keywords"]
                keyword_hits = sum(1 for token in keywords if token.lower() in haystack)
                row_score = (
                    0.35 * (_norm(row.get("status")) == exp["status"])
                    + 0.25 * (_norm(row.get("severity")) == exp["severity"])
                    + 0.25 * min(keyword_hits / max(len(keywords), 1), 1)
                    + 0.15 * bool(cell(row, "recommendation").strip())
                )
                findings_score += per * row_score
            scored = min(1.0, 0.12 * columns_ok + 0.88 * findings_score)
        except Exception as exc:
            # Grader boundary: any read/parse/scoring failure is reported as a
            # failed check, and csv_score stays 0.0 so no partial credit leaks
            # into the total.
            add("k8s_audit_parse", "k8s_audit.csv parseable", False, weights["csv"], str(exc))
        else:
            csv_score = scored
            add("k8s_audit_csv", "k8s_audit.csv contains required findings", csv_score >= 0.70, weights["csv"], {"score": round(csv_score, 4), "rows": len(rows)})
    else:
        add("k8s_audit_missing", "k8s_audit.csv exists", False, weights["csv"], "missing")

    unchanged = _source_unchanged(w)
    add("source_unchanged", "fixtures are not modified", unchanged, weights["source_unchanged"])

    total = csv_score * weights["csv"] + float(unchanged) * weights["source_unchanged"]
    thresholds = gt["scoring"]["thresholds"]
    # Highest band whose threshold the total reaches; "fail" when none match.
    level = "fail"
    for band in ("excellent", "good", "pass"):
        if total >= thresholds[band]:
            level = band
            break
    return {"task": "062-k8s-config-audit", "workspace": str(w), "outcome_score": round(total, 4), "level": level, "checks": checks}