Model Runs: 6 harnesses & 8 models evaluated on this task.
Loading...
Prompt: Software Engineering & Codebase Maintenance · Task 7
The Python package in $WORKSPACE/in/ordercalc has incomplete tests.
Your job:
- Add or edit tests under `$WORKSPACE/in/ordercalc/tests/`.
- Do not edit files under `$WORKSPACE/in/ordercalc/ordercalc/`.
- Read `$WORKSPACE/in/ordercalc/pricing_rules.md` and turn the behavioral contract into tests.
- Include boundary cases that would fail if discount order, free-shipping threshold, bulk aggregation, or rounding behavior were implemented incorrectly.
- The evaluator will run mutation checks against intentionally broken pricing implementations; good tests should fail those mutants.
- The final command `python -m pytest $WORKSPACE/in/ordercalc/tests` must pass.
- Add `$WORKSPACE/in/ordercalc/tests/TEST_INTENT.md` explaining which pricing-rule risks your tests cover. It must mention discount order, free-shipping threshold, bulk aggregation, rounding, coupons, expedited shipping, and validation errors.
Forbidden:
- Do not hard-code private implementation text such as exact source line snippets.
- Do not monkeypatch the functions under test to force success.
- Do not skip or xfail the required behavior tests.
Input Files: 4 files
in/ordercalc/ordercalc/__init__.py
in/ordercalc/ordercalc/pricing.py
in/ordercalc/pricing_rules.md
in/ordercalc/tests/test_pricing_basic.py
LLM Rubric
# Task-specific reference note spliced into the judge prompt below.
# NOTE(review): the trailing "โ" on the first line looks like a mis-encoded
# dash. It is part of the runtime prompt text, so it is kept verbatim here --
# confirm against the upstream source before changing it.
_REF = """
Deterministic oracle runs coverage gates. Assess trace: targeted reads/edits/exec for tests vs noisy churn โ
standard three process dims only.
""".strip()

# Prompt template for the LLM rubric judge. It is a ``str.format`` template:
# ``{task_name}`` and ``{payload}`` are the substitution fields, and the doubled
# braces in the JSON example render as literal braces after formatting.
# Sections are separated by one blank line ("\n\n").
USER_TEMPLATE = "\n\n".join(
    [
        "Task name: {task_name}",
        _dr.RUBRIC_IGNORE_BOOTSTRAP_READS,
        _REF,
        "",  # empty section: keeps the doubled blank-line separator after _REF
        _dr.RUBRIC_USER_SECURITY_GATE_DETAIL,
        _dr.RUBRIC_USER_JSON_NO_QUALITY,
        (
            "Return ONLY JSON:\n"
            '{{"scores": {{"tool_use_appropriate": 0.0, "consistency": 0.0, "robustness": 0.0}}, '
            '"security_gate": 1, "notes": "one line"}}\n\n'
            "--- PROXY TRACE JSON BELOW ---\n"
            "{payload}"
        ),
    ]
)

# Completion Grader
def _md5(path: Path) -> str:
return hashlib.md5(path.read_bytes()).hexdigest()
def _run_pytest_with_timeout(cwd: Path, env: dict[str, str], timeout: float = 20) -> tuple[int, str]:
    """Run ``python3 -m pytest tests`` in *cwd* and summarize the outcome.

    Args:
        cwd: Directory to run pytest from (it must contain ``tests``).
        env: Environment for the child process.
        timeout: Seconds to wait before giving up on the run.

    Returns:
        ``(returncode, detail)`` where *detail* is the last 1000 characters of
        stdout followed by the last 1000 characters of stderr. A run that
        exceeds *timeout* is reported as ``(-1, "pytest timed out ...")``
        instead of raising, so one hung run cannot abort the whole grading.
    """
    try:
        proc = subprocess.run(
            ["python3", "-m", "pytest", "tests"],
            cwd=cwd,
            env=env,
            capture_output=True,
            text=True,
            errors="replace",  # undecodable test output must not crash the grader
            timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        return -1, f"pytest timed out after {timeout}s"
    return proc.returncode, proc.stdout[-1000:] + proc.stderr[-1000:]


def score_workspace(workspace: Path) -> dict[str, Any]:
    """Grade a workspace for the test-coverage-fill task.

    The project is looked up at ``<workspace>/in/ordercalc``, falling back to
    ``<workspace>/ordercalc``. Five signals are computed:

    * ``pytest_passes``    - the submitted test suite passes on the original code.
    * ``mutation_checks``  - fraction of the listed pricing mutants the suite fails on
      (only run when the baseline passes and ``pricing.py`` is unmodified).
    * ``test_intent``      - keyword / assertion-count heuristics over ``tests/test_*.py``.
    * ``test_intent_doc``  - share of required terms present in ``tests/TEST_INTENT.md``.
    * ``source_integrity`` - ``ordercalc/pricing.py`` still matches the pinned hash.

    Args:
        workspace: Root directory of the graded workspace.

    Returns:
        A dict with ``task``, ``outcome_score`` (rounded to 4 places), ``level``
        (``"excellent"``, ``"good"``, ``"pass"`` or ``"fail"``) and ``checks``
        (one entry per signal with ``id``, ``pass``, ``weight``, ``detail``).
    """
    w = Path(workspace).resolve()
    project = w / "in" / "ordercalc"
    if not project.exists():
        project = w / "ordercalc"
    tests_dir = project / "tests"
    checks: list[dict[str, Any]] = []

    def add(cid: str, ok: bool, weight: float, detail: Any = None) -> None:
        checks.append({"id": cid, "pass": bool(ok), "weight": weight, "detail": detail})

    # NOTE(review): the per-check weights passed to add() sum to 1.05, while the
    # weights used for ``total`` below sum to 1.00 -- confirm which set is intended.

    # --- Baseline: the submitted tests must pass against the real implementation.
    env = os.environ.copy()
    env["PYTHONPATH"] = str(project)
    baseline_rc, baseline_detail = _run_pytest_with_timeout(project, env)
    baseline_ok = baseline_rc == 0
    pytest_score = 1.0 if baseline_ok else 0.0
    add("pytest_passes", baseline_ok, 0.25, baseline_detail)

    source_path = project / "ordercalc" / "pricing.py"
    source_ok = source_path.is_file() and _md5(source_path) == _SOURCE_HASH

    # --- Mutation checks: each entry is (name, text to find, replacement text).
    mutation_score = 0.0
    mutation_details: list[str] = []
    mutants = [
        ("vip_discount_removed", 'subtotal *= Decimal("0.90")', 'subtotal *= Decimal("1.00")'),
        ("bulk_threshold_off_by_one", ">= 10", "> 10"),
        ("shipping_threshold_before_5000", "subtotal >= 5000", "subtotal > 5000"),
        ("round_half_even", "ROUND_HALF_UP", "ROUND_HALF_EVEN"),
        ("coupon_ignored", 'subtotal = max(Decimal("0"), subtotal - coupon)', 'subtotal = subtotal'),
        ("expedite_ignored", 'shipping += Decimal("1299")', 'shipping += Decimal("0")'),
        ("bulk_per_line_only", 'if sum(int(i["quantity"]) for i in items) >= 10:', 'if max(int(i["quantity"]) for i in items) >= 10:'),
        ("vip_wrong_rate", 'subtotal *= Decimal("0.90")', 'subtotal *= Decimal("0.95")'),
    ]
    killed = 0
    if baseline_ok and source_ok:
        with tempfile.TemporaryDirectory() as tmp:
            tmp_project = Path(tmp) / "ordercalc"
            # Leave bytecode caches behind: a cached .pyc is trusted when the
            # source's mtime (whole seconds) and size match, so two same-size
            # mutants written within one second could otherwise run stale code.
            shutil.copytree(
                project,
                tmp_project,
                ignore=shutil.ignore_patterns("__pycache__", "*.pyc"),
            )
            pricing = tmp_project / "ordercalc" / "pricing.py"
            original = pricing.read_text(encoding="utf-8")
            menv = os.environ.copy()
            menv["PYTHONPATH"] = str(tmp_project)
            # Same reason as above: never let one mutant's bytecode reach the next run.
            menv["PYTHONDONTWRITEBYTECODE"] = "1"
            for name, old, new in mutants:
                if old not in original:
                    mutation_details.append(f"{name}:setup_missing")
                    continue
                # Every mutant is derived from the pristine text, so no restore
                # step is needed between iterations (the copy is discarded).
                pricing.write_text(original.replace(old, new, 1), encoding="utf-8")
                mutant_rc, _ = _run_pytest_with_timeout(tmp_project, menv)
                # A non-zero exit, including a timeout (-1), means the suite
                # did not pass on the mutant, i.e. the mutant was detected.
                if mutant_rc != 0:
                    killed += 1
                    mutation_details.append(f"{name}:killed")
                else:
                    mutation_details.append(f"{name}:survived")
    elif not baseline_ok:
        mutation_details.append("skipped:baseline_pytest_failed")
    else:
        mutation_details.append("skipped:source_integrity_failed")
    mutation_score = killed / len(mutants)
    add("mutation_checks", mutation_score >= 0.75, 0.35, {"killed": killed, "details": mutation_details})

    # --- Heuristic scan of the test sources (top-level tests/test_*.py only).
    test_text = "\n".join(p.read_text(encoding="utf-8", errors="replace") for p in tests_dir.glob("test_*.py"))
    test_text_lower = test_text.lower()
    required_terms = _GT["required_test_terms"]
    term_hits = sum(term.lower() in test_text_lower for term in required_terms)
    forbidden_hits = [pat for pat in _GT["forbidden_patterns"] if re.search(re.escape(pat), test_text, re.IGNORECASE)]
    assertion_count = len(re.findall(r"\bassert\b|pytest\.raises", test_text))
    required_behaviors = [
        "coupon_cents" in test_text,
        "expedite" in test_text,
        "bulk" in test_text_lower,
        "vip" in test_text_lower,
        "ValueError" in test_text,
    ]
    coverage_score = (
        # max(..., 1) mirrors the intent-term ratio below and avoids a
        # ZeroDivisionError when the required-term list is empty.
        0.55 * min(term_hits / max(len(required_terms), 1), 1.0)
        + 0.25 * min(assertion_count / 10, 1.0)
        + 0.10 * (sum(required_behaviors) / len(required_behaviors))
        + 0.10 * (not forbidden_hits)
    )
    add("test_intent", coverage_score >= 0.70, 0.25, {"term_hits": term_hits, "assertions": assertion_count, "forbidden": forbidden_hits})

    # --- TEST_INTENT.md: share of required terms mentioned (case-insensitive).
    intent_path = tests_dir / "TEST_INTENT.md"
    intent_text = intent_path.read_text(encoding="utf-8", errors="replace").lower() if intent_path.is_file() else ""
    intent_terms = _GT.get("intent_terms", [])
    intent_score = sum(term.lower() in intent_text for term in intent_terms) / max(len(intent_terms), 1)
    add("test_intent_doc", intent_score >= 0.85, 0.08, {"score": round(intent_score, 4)})
    add("source_integrity", source_ok, 0.12, "ordercalc/pricing.py must not change")

    # --- Weighted total; a weak intent document caps the score at 0.88.
    total = pytest_score * 0.23 + mutation_score * 0.34 + coverage_score * 0.23 + intent_score * 0.08 + (1.0 if source_ok else 0.0) * 0.12
    if intent_score < 0.60:
        total = min(total, 0.88)
    thresholds = _GT["scoring"]["thresholds"]
    if total >= thresholds["excellent"]:
        level = "excellent"
    elif total >= thresholds["good"]:
        level = "good"
    elif total >= thresholds["pass"]:
        level = "pass"
    else:
        level = "fail"
    return {"task": "040-test-coverage-fill", "outcome_score": round(total, 4), "level": level, "checks": checks}