Performance Regression Repair

The implementation in $WORKSPACE/in/perfcase/slow.py is correct but too slow for repeated catalog lookups.

Software Engineering & Codebase Maintenance · Task 13 · Oracle + LLM scoring

Task ID: 046-performance-regression

Difficulty: Hard

Tags

Model Runs: 6 harnesses & 8 models evaluated on this task.

Prompt: Software Engineering & Codebase Maintenance · Task 13

The implementation in $WORKSPACE/in/perfcase/slow.py is correct but too slow for repeated catalog lookups.

Your job:

Optimize $WORKSPACE/in/perfcase/slow.py.
Preserve public function names and results.
Preserve the input order of matching order ids.
Treat an order whose "lines" field is missing or empty as having no matches.
If an order contains the same SKU multiple times, return that order id only once for that SKU.
Support arbitrary SKU strings, not only the sample SKU prefix used in the benchmark note.
Do not edit $WORKSPACE/in/perfcase/benchmark_expected.json.
Do not add external dependencies, caching files, subprocess calls, databases, or network calls.

The oracle checks correctness on generated and edge-case data, verifies that fixtures are not modified, and enforces a local time threshold.

Input Files: 2 files

in/perfcase/benchmark_expected.json

in/perfcase/slow.py

LLM Rubric

# Reference guidance for the rubric grader; it is spliced into USER_TEMPLATE.
_REF = (
    "Bench numbers from oracle/profiler; "
    "qualitative: hypotheses tied to deltas, profiler choice, iterative narrowing."
)

# Prompt template for the LLM rubric grader. Sections are separated by a blank
# line. "{task_name}" and "{payload}" are placeholders and the doubled braces
# are escaped literal braces -- presumably the template is filled with
# str.format by the caller (verify against the call site).
USER_TEMPLATE = "\n\n".join(
    (
        "Task name: {task_name}",
        _dr.RUBRIC_IGNORE_BOOTSTRAP_READS,
        _REF,
        "",  # empty section: preserves the doubled blank-line gap after _REF
        _dr.RUBRIC_USER_SECURITY_GATE_DETAIL,
        _dr.RUBRIC_USER_JSON_NO_QUALITY,
        "Return ONLY JSON:\n"
        '{{"scores": {{"tool_use_appropriate": 0.0, "consistency": 0.0, "robustness": 0.0}}, '
        '"security_gate": 1, "notes": "one line"}}\n\n'
        "--- PROXY TRACE JSON BELOW ---\n"
        "{payload}",
    )
)

Completion Grader

def _md5(path: Path) -> str:
    """Return the hexadecimal MD5 digest of the bytes stored at *path*.

    The whole file is read into memory in one call.
    """
    digest = hashlib.md5()
    digest.update(path.read_bytes())
    return digest.hexdigest()


def _orders(n: int = 30000) -> list[dict[str, Any]]:
    orders = []
    for i in range(n):
        lines = []
        for j in range(5):
            lines.append({"sku": f"SKU-{(i + j) % 100:03d}", "qty": (i + j) % 4 + 1})
        orders.append({"id": f"O-{i:05d}", "lines": lines})
    return orders


def _edge_orders() -> list[dict[str, Any]]:
    """Build the small hand-written fixture used for the edge-case check.

    It covers a SKU repeated within one order, an order with an empty
    ``lines`` list, an order with no ``lines`` key, a SKU containing spaces
    and a slash, and an order mixing two different SKUs.
    """
    # (order id, [(sku, qty), ...]); None means the order has no "lines" key.
    table = (
        ("EDGE-001", [("SKU-EDGE", 1), ("SKU-EDGE", 3)]),
        ("EDGE-002", []),
        ("EDGE-003", None),
        ("EDGE-004", [("odd sku/with spaces", 1)]),
        ("EDGE-005", [("SKU-EDGE", 2), ("OTHER", 9)]),
    )
    fixtures: list[dict[str, Any]] = []
    for order_id, pairs in table:
        order: dict[str, Any] = {"id": order_id}
        if pairs is not None:
            order["lines"] = [{"sku": sku, "qty": qty} for sku, qty in pairs]
        fixtures.append(order)
    return fixtures


def _expected(orders: list[dict[str, Any]], sku: str) -> list[str]:
    """Return the reference answer: ids of orders containing *sku*.

    Ids are returned in input order, and each order id appears at most once
    even when the SKU occurs on several of its lines. An order whose
    ``lines`` value is missing, empty, or ``None`` never matches.

    Args:
        orders: Order dicts with an ``"id"`` key and an optional ``"lines"``
            list of dicts that may carry a ``"sku"`` key.
        sku: The SKU string to look for (compared with ``==``).

    Returns:
        The matching order ids, in the order the orders were given.
    """
    return [
        order["id"]
        for order in orders
        # "or []" also covers an explicit None, which .get's default would
        # let through and make the iteration raise TypeError.
        if any(line.get("sku") == sku for line in (order.get("lines") or []))
    ]


def score_workspace(workspace: Path) -> dict[str, Any]:
    """Grade the ``slow.py`` module found under ``<workspace>/in/perfcase``.

    The module is imported from disk and exercised in two phases: a small
    edge-case fixture, then a generated data set whose index build plus 100
    lookups are timed. A final static pass inspects the source text and the
    hash of ``benchmark_expected.json``.

    The total is ``0.20 * edge + 0.30 * correctness + 0.30 * performance +
    0.20 * shape`` and is capped at 0.59 when the performance check fails or
    the expected-results fixture hash does not match.

    Args:
        workspace: Root directory of the workspace being graded.

    Returns:
        A dict with the task id, the rounded ``outcome_score``, the ``level``
        derived from the configured thresholds, and the list of ``checks``.
    """
    w = Path(workspace).resolve()
    project = w / "in" / "perfcase"
    checks: list[dict[str, Any]] = []

    def add(cid: str, ok: bool, weight: float, detail: Any = None) -> None:
        """Append one check record to the report."""
        checks.append({"id": cid, "pass": bool(ok), "weight": weight, "detail": detail})

    correctness = 0.0
    perf_score = 0.0
    edge_score = 0.0
    try:
        # Load the candidate module straight from its file path.
        spec = importlib.util.spec_from_file_location("slow_under_test", project / "slow.py")
        assert spec and spec.loader
        mod = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(mod)

        # Phase 1: edge-case semantics (duplicates, arbitrary SKU, no match).
        edge_index = mod.build_catalog_index(_edge_orders())
        edge_outputs = {
            "duplicate_once": mod.find_orders_for_sku(edge_index, "SKU-EDGE"),
            "arbitrary_sku": mod.find_orders_for_sku(edge_index, "odd sku/with spaces"),
            "missing": mod.find_orders_for_sku(edge_index, "NOT-PRESENT"),
        }
        edge_expected = {
            "duplicate_once": ["EDGE-001", "EDGE-005"],
            "arbitrary_sku": ["EDGE-004"],
            "missing": [],
        }
        edge_score = 1.0 if edge_outputs == edge_expected else 0.0
        add("edge_semantics", edge_score == 1.0, 0.20, {"got": edge_outputs})

        # Phase 2: generated data. Only the index build and the lookups are
        # timed; the reference answers are computed after the clock stops.
        data = _orders()
        start = time.perf_counter()
        index = mod.build_catalog_index(data)
        outputs = [mod.find_orders_for_sku(index, f"SKU-{i:03d}") for i in range(100)]
        elapsed = time.perf_counter() - start
        expected_outputs = [_expected(data, f"SKU-{i:03d}") for i in range(100)]
        correctness = 1.0 if outputs == expected_outputs else 0.0
        perf_score = 1.0 if elapsed <= _GT["max_seconds"] else 0.0
        add("generated_correctness", correctness == 1.0, 0.30)
        add("performance", elapsed <= _GT["max_seconds"], 0.30, {"elapsed": round(elapsed, 6), "max": _GT["max_seconds"]})
    except Exception as exc:
        # Report a failure only for checks that were not recorded before the
        # exception; otherwise a check that already ran (e.g. edge_semantics)
        # would appear twice with contradictory results.
        recorded = {check["id"] for check in checks}
        for cid, weight in (
            ("edge_semantics", 0.20),
            ("generated_correctness", 0.30),
            ("performance", 0.30),
        ):
            if cid not in recorded:
                add(cid, False, weight, str(exc))

    # Static inspection of the submitted source and the protected fixture.
    # NOTE(review): these reads sit outside the try block, so a missing
    # slow.py or benchmark_expected.json raises instead of yielding a failing
    # score -- confirm that is intended.
    source = (project / "slow.py").read_text(encoding="utf-8", errors="replace")
    # Plain substring tests: a name or term counts if it appears anywhere.
    function_hits = sum(name in source for name in _GT["required_functions"])
    forbidden = [term for term in _GT["forbidden_terms"] if term in source]
    expected_ok = _md5(project / "benchmark_expected.json") == _EXPECTED_HASH
    index_hint = any(term in source for term in ("dict", "defaultdict", "setdefault", "{}"))
    shape_score = 0.40 * min(function_hits / len(_GT["required_functions"]), 1) + 0.30 * index_hint + 0.30 * (not forbidden and expected_ok)
    add("implementation_shape", shape_score >= 0.70, 0.20, {"function_hits": function_hits, "forbidden": forbidden, "expected_ok": expected_ok})

    total = edge_score * 0.20 + correctness * 0.30 + perf_score * 0.30 + shape_score * 0.20
    # A slow solution or a modified fixture can never score above 0.59.
    if not perf_score:
        total = min(total, 0.59)
    if not expected_ok:
        total = min(total, 0.59)
    thresholds = _GT["scoring"]["thresholds"]
    level = "excellent" if total >= thresholds["excellent"] else "good" if total >= thresholds["good"] else "pass" if total >= thresholds["pass"] else "fail"
    return {"task": "046-performance-regression", "outcome_score": round(total, 4), "level": level, "checks": checks}

← Task 12 Task 14 →