Fill Missing Unit Test Coverage

The Python package in $WORKSPACE/in/ordercalc has incomplete tests.

Software Engineering & Codebase Maintenance · Task 7 · Oracle + LLM scoring

Task ID: 040-test-coverage-fill

Difficulty: Medium

Tags

Model Runs: 6 harnesses & 8 models evaluated on this task.

Prompt: Software Engineering & Codebase Maintenance · Task 7

The Python package in $WORKSPACE/in/ordercalc has incomplete tests.

Your job:

Add or edit tests under $WORKSPACE/in/ordercalc/tests/.
Do not edit files under $WORKSPACE/in/ordercalc/ordercalc/.
Read $WORKSPACE/in/ordercalc/pricing_rules.md and turn the behavioral contract into tests.
Include boundary cases that would fail if discount order, free-shipping threshold, bulk aggregation, or rounding behavior were implemented incorrectly.
The evaluator will run mutation checks against intentionally broken pricing implementations; good tests should fail those mutants.
The final command python -m pytest $WORKSPACE/in/ordercalc/tests must pass.
Add $WORKSPACE/in/ordercalc/tests/TEST_INTENT.md explaining which pricing-rule risks your tests cover. It must mention discount order, free-shipping threshold, bulk aggregation, rounding, coupons, expedited shipping, and validation errors.

Forbidden:

Do not hard-code private implementation text such as exact source line snippets.
Do not monkeypatch the functions under test to force success.
Do not skip or xfail the required behavior tests.

Input Files: 4 files

in/ordercalc/ordercalc/__init__.py

in/ordercalc/ordercalc/pricing.py

in/ordercalc/pricing_rules.md

in/ordercalc/tests/test_pricing_basic.py

LLM Rubric

# Task-specific guidance that is spliced into the judge prompt below.
_REF = """
Deterministic oracle runs coverage gates. Assess trace: targeted reads/edits/exec for tests vs noisy churn —
standard three process dims only.
""".strip()

# Prompt template for the LLM judge. Sections are separated by one blank line.
# ``{task_name}`` and ``{payload}`` are left as placeholders; the doubled
# braces around the JSON example suggest the template is rendered with
# ``str.format`` (confirm against the caller).
USER_TEMPLATE = "\n\n".join(
    [
        "Task name: {task_name}",
        _dr.RUBRIC_IGNORE_BOOTSTRAP_READS,
        _REF,
        "",  # deliberately empty section: keeps the doubled separator here
        _dr.RUBRIC_USER_SECURITY_GATE_DETAIL,
        _dr.RUBRIC_USER_JSON_NO_QUALITY,
        "Return ONLY JSON:\n"
        '{{"scores": {{"tool_use_appropriate": 0.0, "consistency": 0.0, "robustness": 0.0}}, '
        '"security_gate": 1, "notes": "one line"}}\n\n'
        "--- PROXY TRACE JSON BELOW ---\n"
        "{payload}",
    ]
)

Completion Grader

def _md5(path: Path) -> str:
    """Return the hexadecimal MD5 digest of the file at *path*.

    The whole file is read into memory in one call.
    """
    digest = hashlib.md5()
    digest.update(path.read_bytes())
    return digest.hexdigest()


def _run_pytest(project_dir: Path, *, dont_write_bytecode: bool = False, timeout: float = 20) -> tuple[bool, str]:
    """Run ``python3 -m pytest tests`` inside *project_dir*.

    Args:
        project_dir: Directory used both as the working directory and as
            ``PYTHONPATH`` for the child process.
        dont_write_bytecode: When true, set ``PYTHONDONTWRITEBYTECODE`` so the
            child process does not write ``.pyc`` files.
        timeout: Seconds to wait before the run is abandoned.

    Returns:
        ``(passed, output_tail)`` where ``passed`` is true only for exit code 0
        and ``output_tail`` is the last 1000 characters of stdout followed by
        the last 1000 characters of stderr. A missing directory or a timeout is
        reported as a failed run instead of raising.
    """
    if not project_dir.is_dir():
        return False, f"project directory not found: {project_dir}"
    env = os.environ.copy()
    env["PYTHONPATH"] = str(project_dir)
    if dont_write_bytecode:
        env["PYTHONDONTWRITEBYTECODE"] = "1"
    try:
        proc = subprocess.run(
            ["python3", "-m", "pytest", "tests"],
            cwd=project_dir,
            env=env,
            capture_output=True,
            text=True,
            timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        # A hanging suite must count as a failure, not crash the grader.
        return False, f"pytest timed out after {timeout}s"
    return proc.returncode == 0, proc.stdout[-1000:] + proc.stderr[-1000:]


def score_workspace(workspace: Path) -> dict[str, Any]:
    """Grade a workspace for the ``040-test-coverage-fill`` task.

    The grade combines five signals: the baseline pytest run, mutation checks
    against textually mutated copies of ``ordercalc/pricing.py``, a keyword and
    assertion heuristic over ``tests/test_*.py``, the ``TEST_INTENT.md``
    document, and an MD5 integrity check of ``pricing.py``.

    Args:
        workspace: Root of the workspace. The project is looked up at
            ``<workspace>/in/ordercalc`` and, if that does not exist, at
            ``<workspace>/ordercalc``.

    Returns:
        A dict with keys ``task``, ``outcome_score`` (rounded to 4 places),
        ``level`` (``excellent``/``good``/``pass``/``fail``) and ``checks``
        (a list of ``{"id", "pass", "weight", "detail"}`` dicts).
    """
    w = Path(workspace).resolve()
    project = w / "in" / "ordercalc"
    if not project.exists():
        project = w / "ordercalc"
    tests_dir = project / "tests"
    checks: list[dict[str, Any]] = []

    def add(cid: str, ok: bool, weight: float, detail: Any = None) -> None:
        checks.append({"id": cid, "pass": bool(ok), "weight": weight, "detail": detail})

    # NOTE(review): the per-check weights passed to add() sum to 1.05, while the
    # weights used for ``total`` below sum to 1.00 — confirm this is intended.
    baseline_ok, baseline_output = _run_pytest(project)
    pytest_score = 1.0 if baseline_ok else 0.0
    add("pytest_passes", baseline_ok, 0.25, baseline_output)

    source_path = project / "ordercalc" / "pricing.py"
    source_ok = source_path.is_file() and _md5(source_path) == _SOURCE_HASH

    mutation_details: list[str] = []
    # Each mutant is (name, text to find, replacement); only the first
    # occurrence of the text is replaced.
    mutants = [
        ("vip_discount_removed", 'subtotal *= Decimal("0.90")', 'subtotal *= Decimal("1.00")'),
        ("bulk_threshold_off_by_one", ">= 10", "> 10"),
        ("shipping_threshold_before_5000", "subtotal >= 5000", "subtotal > 5000"),
        ("round_half_even", "ROUND_HALF_UP", "ROUND_HALF_EVEN"),
        ("coupon_ignored", 'subtotal = max(Decimal("0"), subtotal - coupon)', 'subtotal = subtotal'),
        ("expedite_ignored", 'shipping += Decimal("1299")', 'shipping += Decimal("0")'),
        ("bulk_per_line_only", 'if sum(int(i["quantity"]) for i in items) >= 10:', 'if max(int(i["quantity"]) for i in items) >= 10:'),
        ("vip_wrong_rate", 'subtotal *= Decimal("0.90")', 'subtotal *= Decimal("0.95")'),
    ]
    killed = 0
    if baseline_ok and source_ok:
        # Mutate a throwaway copy so the candidate's workspace is never touched.
        with tempfile.TemporaryDirectory() as tmp:
            tmp_project = Path(tmp) / "ordercalc"
            shutil.copytree(project, tmp_project)
            pricing = tmp_project / "ordercalc" / "pricing.py"
            original = pricing.read_text(encoding="utf-8")
            for name, old, new in mutants:
                if old not in original:
                    mutation_details.append(f"{name}:setup_missing")
                    continue
                pricing.write_text(original.replace(old, new, 1), encoding="utf-8")
                try:
                    # Bytecode writing is disabled because some mutants have the
                    # same size as the original source; a cached .pyc written in
                    # the same second could otherwise be reused for the wrong
                    # source text.
                    mutant_passed, _ = _run_pytest(tmp_project, dont_write_bytecode=True)
                finally:
                    pricing.write_text(original, encoding="utf-8")
                if mutant_passed:
                    mutation_details.append(f"{name}:survived")
                else:
                    # Any non-zero exit, including a timeout, kills the mutant.
                    killed += 1
                    mutation_details.append(f"{name}:killed")
    elif not baseline_ok:
        mutation_details.append("skipped:baseline_pytest_failed")
    else:
        mutation_details.append("skipped:source_integrity_failed")
    mutation_score = killed / len(mutants)
    add("mutation_checks", mutation_score >= 0.75, 0.35, {"killed": killed, "details": mutation_details})

    # Keyword/assertion heuristic over the top-level test modules only.
    test_files = sorted(tests_dir.glob("test_*.py")) if tests_dir.is_dir() else []
    test_text = "\n".join(p.read_text(encoding="utf-8", errors="replace") for p in test_files)
    lowered_tests = test_text.lower()
    required_terms = _GT["required_test_terms"]
    term_hits = sum(term.lower() in lowered_tests for term in required_terms)
    forbidden_hits = [pat for pat in _GT["forbidden_patterns"] if re.search(re.escape(pat), test_text, re.IGNORECASE)]
    assertion_count = len(re.findall(r"\bassert\b|pytest\.raises", test_text))
    required_behaviors = [
        "coupon_cents" in test_text,
        "expedite" in test_text,
        "bulk" in lowered_tests,
        "vip" in lowered_tests,
        "ValueError" in test_text,
    ]
    # max(..., 1) mirrors the intent_terms guard below and avoids a
    # ZeroDivisionError when the ground truth lists no required terms.
    term_ratio = term_hits / max(len(required_terms), 1)
    coverage_score = (
        0.55 * min(term_ratio, 1.0)
        + 0.25 * min(assertion_count / 10, 1.0)
        + 0.10 * (sum(required_behaviors) / len(required_behaviors))
        + 0.10 * (not forbidden_hits)
    )
    add("test_intent", coverage_score >= 0.70, 0.25, {"term_hits": term_hits, "assertions": assertion_count, "forbidden": forbidden_hits})

    intent_path = tests_dir / "TEST_INTENT.md"
    intent_text = intent_path.read_text(encoding="utf-8", errors="replace").lower() if intent_path.is_file() else ""
    intent_terms = _GT.get("intent_terms", [])
    intent_score = sum(term.lower() in intent_text for term in intent_terms) / max(len(intent_terms), 1)
    add("test_intent_doc", intent_score >= 0.85, 0.08, {"score": round(intent_score, 4)})

    add("source_integrity", source_ok, 0.12, "ordercalc/pricing.py must not change")
    total = pytest_score * 0.23 + mutation_score * 0.34 + coverage_score * 0.23 + intent_score * 0.08 + (1.0 if source_ok else 0.0) * 0.12
    # A weak or missing intent document caps the overall score.
    if intent_score < 0.60:
        total = min(total, 0.88)
    thresholds = _GT["scoring"]["thresholds"]
    level = "excellent" if total >= thresholds["excellent"] else "good" if total >= thresholds["good"] else "pass" if total >= thresholds["pass"] else "fail"
    return {"task": "040-test-coverage-fill", "outcome_score": round(total, 4), "level": level, "checks": checks}

← Task 6 Task 8 →