Code Review Risk Report

Review the proposed change using $WORKSPACE/in/review/diff.patch and the snippets under $WORKSPACE/in/review/snippets.

Software Engineering & Codebase Maintenance · Task 14 · Oracle + LLM scoring

Task ID: 047-code-review-risk-report

Difficulty: Medium

Tags

Model Runs: 6 harnesses & 8 models evaluated on this task.

Prompt: Software Engineering & Codebase Maintenance · Task 14

Review the proposed change using $WORKSPACE/in/review/diff.patch and the snippets under $WORKSPACE/in/review/snippets.

Create $WORKSPACE/out/review_findings.json with this shape:

{ "findings": [ { "id": "short-stable-id", "severity": "critical|high|medium|low", "file": "relative/path", "line_hint": "string or number", "risk": "what can go wrong", "recommendation": "what should change", "test": "specific regression or security test to add", "evidence": "quote or concise reference" } ], "summary": "brief review summary" }

Do not modify fixture files. Focus on concrete correctness/security/regression risks, not style-only comments. Only report evidence-supported issues. Do not flag harmless naming/style changes or the existing parameterized helper snippets as risks unless the diff makes them risky. Use critical only for direct auth bypass or cross-tenant data exposure. Use high for injection, payment duplication, sensitive data leakage, or cross-user cache leakage. Use medium for data-inclusion regressions without direct privilege escalation.

Input Files: 4 files

in/review/diff.patch

in/review/snippets/api.py

in/review/snippets/db.py

in/review/snippets/logging.py

LLM Rubric

# Reference note embedded in the judge prompt: names the oracle (diff coverage
# plus severity rubric) and the qualitative traits to look for.
_REF = (
    "Diff coverage + severity rubric oracle; qualitative: cites concrete lines/tests, "
    "distinguishes severity from style noise."
)

# Prompt template for the LLM rubric. Sections are separated by one blank
# line. `{task_name}` and `{payload}` are the only placeholders; the JSON
# example uses doubled braces, which suggests the template is rendered with
# str.format -- confirm against the caller.
USER_TEMPLATE = "\n\n".join(
    [
        "Task name: {task_name}",
        _dr.RUBRIC_IGNORE_BOOTSTRAP_READS,
        _REF,
        # Empty section: reproduces the doubled separator that follows _REF.
        "",
        _dr.RUBRIC_USER_SECURITY_GATE_DETAIL,
        _dr.RUBRIC_USER_JSON_NO_QUALITY,
        "Return ONLY JSON:\n"
        '{{"scores": {{"tool_use_appropriate": 0.0, "consistency": 0.0, "robustness": 0.0}}, '
        '"security_gate": 1, "notes": "one line"}}\n\n'
        "--- PROXY TRACE JSON BELOW ---\n"
        "{payload}",
    ]
)

Completion Grader

def _md5(path: Path) -> str:
    """Return the hexadecimal MD5 digest of the bytes stored at *path*.

    The whole file is read into memory in one call. Any error raised while
    reading (for example a missing file) propagates to the caller.
    """
    hasher = hashlib.md5()
    hasher.update(path.read_bytes())
    return hasher.hexdigest()


def score_workspace(workspace: Path) -> dict[str, Any]:
    """Grade ``out/review_findings.json`` inside *workspace*.

    The report is compared with the module-level ground truth ``_GT`` and the
    fixture files under ``in/review`` are compared with the MD5 digests in
    ``_HASHES``.

    Args:
        workspace: Root directory of the task workspace.

    Returns:
        A dict with the keys ``task``, ``outcome_score`` (rounded to four
        decimals), ``level`` (``"excellent"``, ``"good"``, ``"pass"`` or
        ``"fail"``) and ``checks`` (one ``{"id", "pass", "weight", "detail"}``
        entry per recorded check).

    Notes:
        * The ``weight`` stored on each check is informational only; the
          final score uses the separate coefficients in the ``total`` formula.
        * The score is capped at 0.59 when fewer than 6 expected findings are
          matched with a score of at least 0.75, and at 0.74 when fewer than 8.
        * A fixture file that is missing or unreadable counts as a failed
          integrity check instead of aborting the grader.
    """
    w = Path(workspace).resolve()
    checks: list[dict[str, Any]] = []

    def add(cid: str, ok: bool, weight: float, detail: Any = None) -> None:
        checks.append({"id": cid, "pass": bool(ok), "weight": weight, "detail": detail})

    shape_score = 0.0
    finding_score = 0.0
    action_score = 0.0
    # Initialised up front so the cap logic below can read it even when the
    # report is missing or parsing fails part-way through.
    finding_hits: dict[str, float] = {}
    path = w / "out" / "review_findings.json"
    if path.is_file():
        try:
            data = json.loads(path.read_text(encoding="utf-8"))
            findings = data.get("findings", [])
            is_list = isinstance(findings, list)
            shape_ok = is_list and len(findings) >= 8 and isinstance(data.get("summary", ""), str)
            shape_score = 1.0 if shape_ok else 0.0
            add("json_shape", shape_score == 1.0, 0.18, {"findings": len(findings) if is_list else None})

            # Only dict entries of a list-valued "findings" can be scored.
            # Each one is serialised once; the original list index is kept so
            # a finding claimed by one expected item is not reused by another.
            candidates = [
                (idx, finding, json.dumps(finding, ensure_ascii=False).lower())
                for idx, finding in enumerate(findings if is_list else [])
                if isinstance(finding, dict)
            ]
            expected_findings = _GT["expected_findings"]

            # --- finding coverage: best unclaimed match per expected item ---
            used_indices: set[int] = set()
            for expected in expected_findings:
                best = 0.0
                best_idx = -1
                for idx, finding, text in candidates:
                    if idx in used_indices:
                        continue
                    file_ok = str(finding.get("file", "")).lower() == expected["file"].lower()
                    term_score = sum(term.lower() in text for term in expected["terms"]) / len(expected["terms"])
                    rec = str(finding.get("recommendation", "")).lower()
                    # Two matching recommendation terms are enough for full credit.
                    rec_score = min(sum(term.lower() in rec for term in expected["recommendation_terms"]) / 2, 1.0)
                    severity_ok = str(finding.get("severity", "")).lower() in {"critical", "high", "medium"}
                    score = 0.35 * file_ok + 0.35 * term_score + 0.20 * rec_score + 0.10 * severity_ok
                    if score > best:
                        best = score
                        best_idx = idx
                # A finding is only claimed when the match is strong.
                if best_idx >= 0 and best >= 0.75:
                    used_indices.add(best_idx)
                finding_hits[expected["id"]] = round(best, 3)

            all_text = json.dumps(findings, ensure_ascii=False).lower()
            # all_text is lowercased, so the terms must be lowercased as well.
            false_positive_ok = not any(
                term.lower() in all_text for term in _GT.get("false_positive_terms", [])
            )
            finding_score = 0.90 * (sum(finding_hits.values()) / len(finding_hits)) + 0.10 * false_positive_ok
            add("finding_coverage", finding_score >= 0.75, 0.56, {"finding_hits": finding_hits})

            # --- severity exactness / regression-test coverage ---
            # For each expected item, only the first finding with the same
            # file and at least two matching terms is inspected.
            severity_hits = 0
            test_hits = 0
            for expected in expected_findings:
                for _idx, finding, text in candidates:
                    if str(finding.get("file", "")).lower() != expected["file"].lower():
                        continue
                    if sum(term.lower() in text for term in expected["terms"]) < 2:
                        continue
                    if str(finding.get("severity", "")).lower() == expected.get("severity", "").lower():
                        severity_hits += 1
                    test_text = str(finding.get("test", "")).lower()
                    if test_text and sum(term.lower() in test_text for term in expected.get("test_terms", [])) >= 2:
                        test_hits += 1
                    break
            expected_count = len(expected_findings)
            add("severity_exactness", severity_hits >= expected_count, 0.10, {"severity_hits": severity_hits, "expected": expected_count})
            add("test_coverage", test_hits >= expected_count, 0.18, {"test_hits": test_hits, "expected": expected_count})

            # --- actionability: recommendation, evidence and a valid severity ---
            rec_hits = sum(
                bool(str(item.get("recommendation", "")).strip())
                and bool(str(item.get("evidence", "")).strip())
                and str(item.get("severity", "")).lower() in {"critical", "high", "medium", "low"}
                for _idx, item, _text in candidates
            )
            action_score = min(rec_hits / 8, 1.0)
            add("actionability", action_score >= 0.75, 0.16, {"recommendation_evidence_hits": rec_hits})
        except Exception as exc:
            # Grader boundary: any parse or scoring failure is reported as a
            # failed shape check rather than propagated.
            add("json_shape", False, 0.18, str(exc))
    else:
        add("json_shape", False, 0.18, "missing out/review_findings.json")

    # --- fixture integrity ---
    base = w / "in" / "review"
    intact: list[bool] = []
    for rel, digest in _HASHES.items():
        try:
            intact.append(_md5(base / rel) == digest)
        except OSError:
            # Deleted or unreadable fixture: treat as modified.
            intact.append(False)
    integrity = sum(intact) / len(intact)
    add("fixture_integrity", integrity == 1.0, 0.10, {"score": integrity})

    def passed(cid: str) -> bool:
        return any(c["id"] == cid and c["pass"] for c in checks)

    severity_score = 1.0 if passed("severity_exactness") else 0.0
    test_score = 1.0 if passed("test_coverage") else 0.0
    total = (
        shape_score * 0.10
        + finding_score * 0.35
        + action_score * 0.10
        + integrity * 0.10
        + severity_score * 0.10
        + test_score * 0.25
    )

    strong_findings = sum(1 for score in finding_hits.values() if score >= 0.75)
    if strong_findings < 6:
        total = min(total, 0.59)
    elif strong_findings < 8:
        total = min(total, 0.74)

    thresholds = _GT["scoring"]["thresholds"]
    if total >= thresholds["excellent"]:
        level = "excellent"
    elif total >= thresholds["good"]:
        level = "good"
    elif total >= thresholds["pass"]:
        level = "pass"
    else:
        level = "fail"
    return {"task": "047-code-review-risk-report", "outcome_score": round(total, 4), "level": level, "checks": checks}

← Task 13 | Task 15 →