Model Runs
6 harnesses & 8 models evaluated on this task.
Loading...
Prompt
Software Engineering & Codebase Maintenance · Task 19
The pytest suite in $WORKSPACE/in/flakyqueue is flaky. The failures are symptoms of production code nondeterminism, not bad tests.
Your job:
- Run `python -m pytest tests` from `$WORKSPACE/in/flakyqueue`.
- Identify and fix the root cause in the implementation.
- Make task ordering and retry scheduling deterministic when a clock/random source is injected.
- Write `$WORKSPACE/out/root_cause.md` describing the flaky behavior, the root cause, and the verification you ran.
Constraints:
- Do not modify files under `$WORKSPACE/in/flakyqueue/tests`.
- Do not skip, xfail, or weaken tests.
- Do not add long sleeps or retry loops around pytest.
- Do not delete retry jitter; make it injectable/deterministic.
- Do not hard-code fixture task ids.
- Do not add third-party dependencies.
Input Files
8 files
in/flakyqueue/README.md
in/flakyqueue/flakyqueue/__init__.py
in/flakyqueue/flakyqueue/clock.py
in/flakyqueue/flakyqueue/scheduler.py
in/flakyqueue/flakyqueue/store.py
in/flakyqueue/pyproject.toml
in/flakyqueue/tests/test_retry_order.py
in/flakyqueue/tests/test_scheduler.py
LLM Rubric
USER_TEMPLATE = """This task: Stabilize a Flaky Test by Fixing the Root Cause. The agent should follow the prompt, modify only the required local project files, and produce the requested artifacts.
Evaluate the agent run:
- tool_use_appropriate: uses code inspection, local tests, and file edits suited to the task.
- consistency: changes match the requested repair or report and preserve constraints.
- robustness: handles test failures, missing context, and verification steps without unrelated churn.
Return ONLY JSON with scores, security_gate, and notes.
"""
Completion Grader
def _md5(path: Path) -> str:
    """Return the hexadecimal MD5 digest of the file at *path*.

    The digest is used only to detect modified files, not for security, so
    the hash is requested with ``usedforsecurity=False``; this keeps the call
    working on builds where MD5 is disabled for security purposes.

    Raises:
        OSError: if *path* cannot be read.
    """
    return hashlib.md5(path.read_bytes(), usedforsecurity=False).hexdigest()
def score_workspace(workspace: Path) -> dict[str, Any]:
    """Grade an agent's attempt at the flaky-queue repair task.

    Five checks are run against ``<workspace>/in/flakyqueue`` and
    ``<workspace>/out/root_cause.md``:

    * ``repeated_pytest_stable`` (0.35): ``pytest tests`` exits 0 on up to
      eight consecutive runs; the loop stops at the first failure.
    * ``hidden_determinism`` (0.35): an inline script drives ``Scheduler``
      with an injected clock and random source and asserts ordering and
      retry timing.
    * ``test_integrity`` (0.15): fraction of files listed in ``TEST_HASHES``
      that still exist with the recorded MD5 digest.
    * ``implementation_quality`` (0.10): substring heuristics over
      ``flakyqueue/scheduler.py``.
    * ``root_cause_report`` (0.05): fraction of expected keywords present in
      the report.

    The weighted total is capped at 0.62 when either pytest stability or the
    hidden check fails, and at 0.55 when any test file was altered.

    A subprocess that exceeds its 20 second timeout counts as a failed run
    (return code recorded as ``None``) rather than aborting the grader. A
    missing ``scheduler.py`` scores 0 for implementation quality, and an
    empty ``TEST_HASHES`` scores 0 for integrity.

    Args:
        workspace: Root directory containing the ``in`` and ``out`` trees.

    Returns:
        A dict with keys ``task``, ``outcome_score`` (rounded to 4 places),
        ``level`` (``excellent``/``good``/``pass``/``fail``) and ``checks``
        (one entry per check with ``id``, ``pass``, ``weight``, ``detail``).
    """
    root = Path(workspace).resolve()
    project = root / "in" / "flakyqueue"
    checks: list[dict[str, Any]] = []

    def add(cid: str, ok: bool, weight: float, detail: Any = None) -> None:
        checks.append({"id": cid, "pass": bool(ok), "weight": weight, "detail": detail})

    def as_text(stream: Any) -> str:
        # TimeoutExpired may carry None or raw bytes even when text=True.
        if stream is None:
            return ""
        if isinstance(stream, bytes):
            return stream.decode("utf-8", errors="replace")
        return stream

    env = os.environ.copy()
    env["PYTHONPATH"] = str(project)

    def run_python(args: list[str]) -> tuple:
        """Run ``python3 <args>`` in the project; return (code, stdout, stderr)."""
        try:
            proc = subprocess.run(["python3", *args], cwd=project, env=env, capture_output=True, text=True, timeout=20)
        except subprocess.TimeoutExpired as exc:
            # A hang is a failed run, not a grader crash.
            return None, as_text(exc.stdout), as_text(exc.stderr) + "\n[timed out after 20s]"
        return proc.returncode, proc.stdout, proc.stderr

    # Repeat the suite to expose flakiness; stop at the first failing run.
    runs = []
    last_output = ""
    for _ in range(8):
        code, out, err = run_python(["-m", "pytest", "tests"])
        runs.append(code)
        if code != 0:
            last_output = out[-800:] + err[-800:]
            break
    stable_score = 1.0 if all(code == 0 for code in runs) else 0.0
    add("repeated_pytest_stable", stable_score == 1.0, 0.35, {"returncodes": runs, "last_output": last_output})

    # Hidden check: ordering and retry jitter must follow the injected sources.
    script = """
from flakyqueue.scheduler import Scheduler
class Clock:
    def __init__(self): self.value = 10.0
    def now(self): return self.value
class Rand:
    def __init__(self): self.values = [0.75, 0.125]
    def random(self): return self.values.pop(0)
clock = Clock()
s = Scheduler(clock=clock, random_source=Rand())
s.add("z", 2)
s.add("a", 2)
s.add("m", 5)
assert [t.id for t in s.ready()] == ["m", "a", "z"]
t = s.schedule_retry(s.ready()[0], base_delay=3)
assert t.run_at == 13.75
clock.value = 13.74
assert [x.id for x in s.ready()] == ["a", "z"]
clock.value = 13.75
assert [x.id for x in s.ready()] == ["m", "a", "z"]
"""
    hidden_code, hidden_out, hidden_err = run_python(["-c", script])
    hidden_score = 1.0 if hidden_code == 0 else 0.0
    add("hidden_determinism", hidden_score == 1.0, 0.35, hidden_out[-500:] + hidden_err[-500:])

    # Test files must be byte-identical to their recorded digests.
    integrity_items = []
    for rel, digest in TEST_HASHES.items():
        path = project / rel
        integrity_items.append(path.is_file() and _md5(path) == digest)
    # Fail closed when there is nothing to verify instead of dividing by zero.
    integrity = sum(integrity_items) / len(integrity_items) if integrity_items else 0.0
    add("test_integrity", integrity == 1.0, 0.15, {"score": integrity})

    # Plain substring heuristics: 0.55 for avoiding the banned terms, 0.45
    # spread over the expected identifiers.
    scheduler_path = project / "flakyqueue" / "scheduler.py"
    bad_terms = ["pytest.skip", "xfail", "sleep(", "time.time()"]
    if scheduler_path.is_file():
        source = scheduler_path.read_text(encoding="utf-8", errors="replace")
        quality = 0.55 * (not any(term in source for term in bad_terms)) + 0.45 * (sum(term in source for term in ["clock.now", "random_source", "created_at", "id"]) / 4)
    else:
        # No implementation file to inspect, so no quality credit.
        quality = 0.0
    add("implementation_quality", quality >= 0.75, 0.10, {"score": round(quality, 4)})

    report = root / "out" / "root_cause.md"
    text = report.read_text(encoding="utf-8", errors="replace").lower() if report.is_file() else ""
    report_score = sum(term in text for term in ["clock", "order", "jitter", "pytest"]) / 4
    add("root_cause_report", report_score >= 0.75, 0.05, {"score": report_score})

    total = stable_score * 0.35 + hidden_score * 0.35 + integrity * 0.15 + quality * 0.10 + report_score * 0.05
    # Functional failures and tampered tests cap the score regardless of the rest.
    if stable_score < 1.0 or hidden_score < 1.0:
        total = min(total, 0.62)
    if integrity < 1.0:
        total = min(total, 0.55)
    level = "excellent" if total >= 0.9 else "good" if total >= 0.75 else "pass" if total >= 0.6 else "fail"
    return {"task": "085-flaky-test-root-cause", "outcome_score": round(total, 4), "level": level, "checks": checks}