Performance Regression Repair

The implementation in $WORKSPACE/in/perfcase/slow.py is correct but too slow for repeated catalog lookups.

Software Engineering & Codebase Maintenance · Task 13 · Oracle + LLM scoring
Model Runs: 6 harnesses & 8 models evaluated on this task.
Loading...
Prompt: Software Engineering & Codebase Maintenance · Task 13

The implementation in $WORKSPACE/in/perfcase/slow.py is correct but too slow for repeated catalog lookups.

Your job:

  • Optimize $WORKSPACE/in/perfcase/slow.py.
  • Preserve public function names and results.
  • Preserve the input order of matching order ids.
  • Treat missing or empty lines as no matches.
  • If an order contains the same SKU multiple times, return that order id only once for that SKU.
  • Support arbitrary SKU strings, not only the sample SKU prefix used in the benchmark note.
  • Do not edit $WORKSPACE/in/perfcase/benchmark_expected.json.
  • Do not add external dependencies, caching files, subprocess calls, databases, or network calls.

The oracle checks correctness on generated and edge-case data, verifies that fixtures are not modified, and enforces a local time threshold.

Input Files: 2 files
in/perfcase/benchmark_expected.json
in/perfcase/slow.py
LLM Rubric
# Reference guidance that is embedded verbatim in the rubric prompt.
_REF = """
Bench numbers from oracle/profiler; qualitative: hypotheses tied to deltas, profiler choice, iterative narrowing.
""".strip()

# Prompt template for the LLM rubric. ``{task_name}`` and ``{payload}`` are
# ``str.format`` placeholders; the doubled braces in the JSON example are
# escapes that render as single braces after formatting.
USER_TEMPLATE = "".join(
    (
        "Task name: {task_name}\n\n",
        _dr.RUBRIC_IGNORE_BOOTSTRAP_READS,
        "\n\n",
        _REF,
        # Two consecutive paragraph breaks are kept on purpose so the rendered
        # prompt stays identical to the previous concatenation.
        "\n\n",
        "\n\n",
        _dr.RUBRIC_USER_SECURITY_GATE_DETAIL,
        "\n\n",
        _dr.RUBRIC_USER_JSON_NO_QUALITY,
        # Adjacent literals below are merged by the compiler into one string.
        "\n\n"
        "Return ONLY JSON:\n"
        '{{"scores": {{"tool_use_appropriate": 0.0, "consistency": 0.0, "robustness": 0.0}}, '
        '"security_gate": 1, "notes": "one line"}}\n\n'
        "--- PROXY TRACE JSON BELOW ---\n"
        "{payload}",
    )
)
Completion Grader
def _md5(path: Path) -> str:
    """Return the hexadecimal MD5 digest of the bytes stored at *path*.

    The whole file is read into memory in one call.
    """
    hasher = hashlib.md5()
    hasher.update(path.read_bytes())
    return hasher.hexdigest()


def _orders(n: int = 30000) -> list[dict[str, Any]]:
    """Generate *n* deterministic synthetic orders with five lines each.

    Order ``i`` gets the id ``O-<i zero-padded to 5>``. Its ``j``-th line
    (``j`` in 0..4) has SKU ``SKU-<(i + j) % 100 zero-padded to 3>`` and
    quantity ``(i + j) % 4 + 1``.
    """
    return [
        {
            "id": f"O-{order_no:05d}",
            "lines": [
                {
                    "sku": f"SKU-{(order_no + offset) % 100:03d}",
                    "qty": (order_no + offset) % 4 + 1,
                }
                for offset in range(5)
            ],
        }
        for order_no in range(n)
    ]


def _edge_orders() -> list[dict[str, Any]]:
    """Return a fresh list of hand-written edge-case orders.

    The fixtures cover a SKU repeated inside one order, an empty ``lines``
    list, an order without a ``lines`` key, a SKU containing spaces and a
    slash, and an order mixing the repeated SKU with an unrelated one.
    """

    def line(sku: str, qty: int) -> dict[str, Any]:
        # Build one order line; key order is "sku" then "qty".
        return {"sku": sku, "qty": qty}

    repeated_sku = {"id": "EDGE-001", "lines": [line("SKU-EDGE", 1), line("SKU-EDGE", 3)]}
    empty_lines = {"id": "EDGE-002", "lines": []}
    no_lines_key = {"id": "EDGE-003"}
    unusual_sku = {"id": "EDGE-004", "lines": [line("odd sku/with spaces", 1)]}
    mixed_skus = {"id": "EDGE-005", "lines": [line("SKU-EDGE", 2), line("OTHER", 9)]}
    return [repeated_sku, empty_lines, no_lines_key, unusual_sku, mixed_skus]


def _expected(orders: list[dict[str, Any]], sku: str) -> list[str]:
    """Return the ids of orders containing *sku*, in input order.

    This is the reference answer used to check the optimized lookup.

    Args:
        orders: Order dicts with an ``"id"`` and an optional ``"lines"`` list
            of line dicts.
        sku: The SKU string to look for; compared with ``==``.

    Returns:
        One id per matching order, even when the SKU appears on several of
        that order's lines. Orders whose ``"lines"`` is missing, empty or
        ``None`` never match.
    """
    return [
        order["id"]
        for order in orders
        # ``or ()`` also covers an explicit ``"lines": None``, which the plain
        # ``.get("lines", [])`` default would try to iterate and crash on.
        if any(line.get("sku") == sku for line in order.get("lines") or ())
    ]


def score_workspace(workspace: Path) -> dict[str, Any]:
    """Score the candidate ``in/perfcase/slow.py`` found under *workspace*.

    The candidate module is loaded and exercised on hand-written edge cases
    and on generated data, with the generated run timed. Its source text and
    the ``benchmark_expected.json`` fixture are then inspected.

    Weights: edge semantics 0.20, generated correctness 0.30, performance
    0.30, implementation shape 0.20. The total is capped at 0.59 when the
    performance check fails or when the fixture hash does not match.

    Args:
        workspace: Root directory containing ``in/perfcase``.

    Returns:
        A dict with ``task``, ``outcome_score`` (rounded to 4 places),
        ``level`` (``excellent``/``good``/``pass``/``fail``) and the list of
        individual ``checks``.
    """
    w = Path(workspace).resolve()
    project = w / "in" / "perfcase"
    checks: list[dict[str, Any]] = []

    def add(cid: str, ok: bool, weight: float, detail: Any = None) -> None:
        checks.append({"id": cid, "pass": bool(ok), "weight": weight, "detail": detail})

    correctness = 0.0
    perf_score = 0.0
    edge_score = 0.0
    try:
        spec = importlib.util.spec_from_file_location("slow_under_test", project / "slow.py")
        # Explicit raise instead of ``assert``: asserts vanish under ``-O`` and
        # an AssertionError would leave the check detail empty.
        if spec is None or spec.loader is None:
            raise ImportError("cannot create an import spec for slow.py")
        mod = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(mod)

        edge_index = mod.build_catalog_index(_edge_orders())
        edge_outputs = {
            "duplicate_once": mod.find_orders_for_sku(edge_index, "SKU-EDGE"),
            "arbitrary_sku": mod.find_orders_for_sku(edge_index, "odd sku/with spaces"),
            "missing": mod.find_orders_for_sku(edge_index, "NOT-PRESENT"),
        }
        edge_expected = {
            "duplicate_once": ["EDGE-001", "EDGE-005"],
            "arbitrary_sku": ["EDGE-004"],
            "missing": [],
        }
        edge_score = 1.0 if edge_outputs == edge_expected else 0.0
        add("edge_semantics", edge_score == 1.0, 0.20, {"got": edge_outputs})

        data = _orders()
        # Compute the reference answers before the candidate sees ``data`` so
        # that a candidate mutating its input cannot alter the oracle.
        expected_outputs = [_expected(data, f"SKU-{i:03d}") for i in range(100)]
        start = time.perf_counter()
        index = mod.build_catalog_index(data)
        outputs = [mod.find_orders_for_sku(index, f"SKU-{i:03d}") for i in range(100)]
        elapsed = time.perf_counter() - start
        correctness = 1.0 if outputs == expected_outputs else 0.0
        perf_score = 1.0 if elapsed <= _GT["max_seconds"] else 0.0
        add("generated_correctness", correctness == 1.0, 0.30)
        add("performance", elapsed <= _GT["max_seconds"], 0.30, {"elapsed": round(elapsed, 6), "max": _GT["max_seconds"]})
    except Exception as exc:
        # Only report failures for checks that were not already recorded;
        # otherwise a late exception would append a second, contradictory
        # "edge_semantics" entry after a passing one.
        recorded = {check["id"] for check in checks}
        for cid, weight in (("edge_semantics", 0.20), ("generated_correctness", 0.30), ("performance", 0.30)):
            if cid not in recorded:
                add(cid, False, weight, str(exc))

    try:
        source = (project / "slow.py").read_text(encoding="utf-8", errors="replace")
    except OSError:
        # An unreadable candidate yields no shape credit instead of crashing.
        source = ""
    function_hits = sum(name in source for name in _GT["required_functions"])
    forbidden = [term for term in _GT["forbidden_terms"] if term in source]
    try:
        expected_ok = _md5(project / "benchmark_expected.json") == _EXPECTED_HASH
    except OSError:
        # A deleted or unreadable fixture counts as a modified fixture.
        expected_ok = False
    # Substring heuristic for "the solution builds some kind of mapping".
    index_hint = any(term in source for term in ("dict", "defaultdict", "setdefault", "{}"))
    shape_score = 0.40 * min(function_hits / len(_GT["required_functions"]), 1) + 0.30 * index_hint + 0.30 * (not forbidden and expected_ok)
    add("implementation_shape", shape_score >= 0.70, 0.20, {"function_hits": function_hits, "forbidden": forbidden, "expected_ok": expected_ok})
    total = edge_score * 0.20 + correctness * 0.30 + perf_score * 0.30 + shape_score * 0.20
    # Missing the time threshold or touching the fixture caps the total.
    if not perf_score:
        total = min(total, 0.59)
    if not expected_ok:
        total = min(total, 0.59)
    thresholds = _GT["scoring"]["thresholds"]
    level = "excellent" if total >= thresholds["excellent"] else "good" if total >= thresholds["good"] else "pass" if total >= thresholds["pass"] else "fail"
    return {"task": "046-performance-regression", "outcome_score": round(total, 4), "level": level, "checks": checks}