Model Runs: 6 harnesses & 8 models evaluated on this task.
Loading...
Prompt: SRE, DevOps & Release Ops · Task 2
You are auditing an offline Kubernetes service configuration.
Inputs:
- $WORKSPACE/in/deployment.yaml
- $WORKSPACE/in/service.yaml
- $WORKSPACE/in/policy.md
Create exactly this output file:
- $WORKSPACE/out/k8s_audit.csv
CSV requirements:
- Header must include: check_id,resource,status,severity,evidence,recommendation
- One row per relevant finding.
- status must be one of pass, fail, warn.
- Use the exact check_id values when a finding applies:
cpu_memory_limits, readiness_probe, liveness_probe, service_selector, service_target_port, image_tag_policy, privileged_container, service_type_policy
Audit expectations:
- Check container CPU and memory requests/limits.
- Check readiness and liveness probes.
- Check Deployment labels vs Service selector.
- Check Service port/targetPort vs declared container ports.
- Apply every forbidden or required item in policy.md.
Do not modify input files. Do not call the network or use live cluster tools. Base your answer only on the supplied files.
Input Files: 3 files
in/deployment.yaml
in/policy.md
in/service.yaml
LLM Rubric
# Prompt template sent to the LLM rubric judge.
#
# It is assembled from a task-name line, several shared fragments taken from
# the `_dr` module, a task-specific instruction paragraph, and the required
# JSON reply shape followed by the proxy trace.
#
# Placeholders: {task_name} and {payload}. The doubled braces around the JSON
# example presumably exist so the template can be rendered with str.format()
# and still emit literal braces -- confirm against the caller.
#
# NOTE(review): the instruction paragraph already ends with "\n\n" and is
# followed by another "\n\n", so the rendered prompt has four consecutive
# newlines at that point. Left untouched to keep the prompt byte-identical.
USER_TEMPLATE = (
    "Task name: {task_name}\n\n"
    + _dr.RUBRIC_IGNORE_BOOTSTRAP_READS
    + "\n\n"
    + _dr.RUBRIC_USER_OUTCOME_NOTICE
    + "\n\nEvaluate whether the Kubernetes audit is evidence-based, concise, and does not invent cluster state. "
    "The deterministic oracle owns factual scoring; use this rubric only for process, clarity, and safe recommendations.\n\n"
    + "\n\n"
    + _dr.RUBRIC_USER_SECURITY_GATE_DETAIL
    + "\n\n"
    + _dr.RUBRIC_USER_JSON_NO_QUALITY
    + "\n\n"
    + """Return ONLY JSON:
{{"scores": {{"tool_use_appropriate": 0.0, "consistency": 0.0, "robustness": 0.0}}, "security_gate": 1, "notes": "one line"}}
--- PROXY TRACE JSON BELOW ---
{payload}"""
)
# --- Completion Grader ---
def _load_json(path: Path) -> Any:
    """Decode the UTF-8 JSON document stored at *path* and return the parsed value."""
    with path.open("r", encoding="utf-8") as handle:
        return json.load(handle)
def _norm(value: Any) -> str:
    """Return *value* as a stripped, lower-cased string.

    Any falsy input (``None``, ``""``, ``0``, ``False``, empty containers)
    collapses to the empty string.
    """
    if not value:
        return ""
    text = str(value)
    return text.strip().lower()
def _source_unchanged(workspace: Path) -> bool:
    """Report whether the workspace's input files still match the task fixtures.

    Every regular file under ``_TASK_DIR / "fixtures" / "in"`` is compared
    byte-for-byte with the file at the same relative path under
    ``<workspace>/in``.

    Returns:
        ``False`` as soon as one workspace copy differs from its fixture;
        ``True`` otherwise. A workspace that is not a directory, and fixture
        files that have no regular-file counterpart in the workspace, are not
        treated as modifications.
    """
    fixtures_root = _TASK_DIR / "fixtures" / "in"
    workspace_root = workspace.resolve()
    if not workspace_root.is_dir():
        return True

    workspace_inputs = workspace_root / "in"
    for fixture in fixtures_root.rglob("*"):
        if fixture.is_file():
            copy = workspace_inputs / fixture.relative_to(fixtures_root)
            # Only an existing copy can count as modified; absent copies are skipped.
            if not copy.is_file():
                continue
            if copy.read_bytes() != fixture.read_bytes():
                return False
    return True
def score_workspace(workspace: Path) -> dict[str, Any]:
    """Grade the Kubernetes audit CSV found in *workspace*.

    The ground truth loaded from ``_GT`` supplies the required CSV columns,
    the expected findings (status, severity and keywords per ``check_id``),
    the check weights and the level thresholds.

    Scoring of ``out/k8s_audit.csv``:
      * each expected finding contributes an equal share, split as
        0.35 status match, 0.25 severity match, 0.25 keyword coverage in the
        evidence/recommendation/resource cells, and 0.15 for a non-empty
        recommendation;
      * the final CSV score is ``0.12 * columns_ok + 0.88 * findings_score``,
        capped at 1.0, and the CSV check passes at 0.70 or above.

    The overall score adds the weighted "fixtures unchanged" check.

    Args:
        workspace: Root of the workspace to grade.

    Returns:
        A dict with the task id, the resolved workspace path, the rounded
        ``outcome_score``, the resulting ``level`` and the list of recorded
        checks.
    """
    w = workspace.resolve()
    gt = _load_json(_GT)
    weights = gt["scoring"]["weights"]
    checks: list[dict[str, Any]] = []

    def add(cid: str, label: str, ok: bool, weight: float, detail: Any = None) -> None:
        """Record the outcome of one grading check."""
        checks.append({"id": cid, "label": label, "pass": bool(ok), "weight": weight, "detail": detail})

    csv_score = 0.0
    path = w / "out" / "k8s_audit.csv"
    if path.is_file():
        try:
            with path.open("r", encoding="utf-8", newline="") as fh:
                reader = csv.DictReader(fh)
                rows = list(reader)
                # Read the header from the reader itself so that a CSV with a
                # correct header but no data rows still passes the column check.
                header = reader.fieldnames or []
            columns_ok = set(gt["required_columns"]).issubset(header)

            # DictReader pads short rows with None; `or ""` keeps that from
            # being stringified to the literal text "None".
            by_id = {str(row.get("check_id") or "").strip(): row for row in rows}

            expected = gt["expected_findings"]
            per = 1.0 / len(expected)
            for check_id, exp in expected.items():
                row = by_id.get(check_id, {})
                evidence, recommendation, resource = (
                    str(row.get(key) or "") for key in ("evidence", "recommendation", "resource")
                )
                text = " ".join((evidence, recommendation, resource)).lower()
                keywords = exp["keywords"]
                keyword_hits = sum(1 for token in keywords if token.lower() in text)
                row_score = (
                    0.35 * (_norm(row.get("status")) == exp["status"])
                    + 0.25 * (_norm(row.get("severity")) == exp["severity"])
                    + 0.25 * min(keyword_hits / max(len(keywords), 1), 1)
                    + 0.15 * bool(recommendation.strip())
                )
                csv_score += per * row_score
            csv_score = min(1.0, 0.12 * columns_ok + 0.88 * csv_score)
            add(
                "k8s_audit_csv",
                "k8s_audit.csv contains required findings",
                csv_score >= 0.70,
                weights["csv"],
                {"score": round(csv_score, 4), "rows": len(rows)},
            )
        except Exception as exc:
            # Grader boundary: any failure while reading or scoring the CSV is
            # reported as a failed check. Drop any partially accumulated score
            # so the total stays consistent with that failed check.
            csv_score = 0.0
            add("k8s_audit_parse", "k8s_audit.csv parseable", False, weights["csv"], str(exc))
    else:
        add("k8s_audit_missing", "k8s_audit.csv exists", False, weights["csv"], "missing")

    unchanged = _source_unchanged(w)
    add("source_unchanged", "fixtures are not modified", unchanged, weights["source_unchanged"])

    total = csv_score * weights["csv"] + float(unchanged) * weights["source_unchanged"]
    thresholds = gt["scoring"]["thresholds"]
    if total >= thresholds["excellent"]:
        level = "excellent"
    elif total >= thresholds["good"]:
        level = "good"
    elif total >= thresholds["pass"]:
        level = "pass"
    else:
        level = "fail"

    return {
        "task": "062-k8s-config-audit",
        "workspace": str(w),
        "outcome_score": round(total, 4),
        "level": level,
        "checks": checks,
    }