Model Runs
6 harnesses & 8 models evaluated on this task.
Loading...
Prompt
Software Engineering & Codebase Maintenance · Task 17
The monorepo at $WORKSPACE/in/shopmono has a cross-package interface regression.
catalog recently migrated product prices from the old integer price_cents field to a Money(amount_cents, currency) object. The orders and reports packages still contain old assumptions and the integration tests fail.
Your job:
- Run `python -m pytest tests` from `$WORKSPACE/in/shopmono`.
- Repair the cross-package compatibility issue while preserving the new `catalog` model.
- Keep support for legacy product dictionaries that still contain `price_cents`.
- Preserve currency validation: mixed currencies in one order must raise a clear error.
- Write `$WORKSPACE/out/interface_fix_report.md` with the root cause, changed files, and verification command.
Constraints:
- Do not modify files under `$WORKSPACE/in/shopmono/tests`.
- Do not hard-code fixture SKUs, order ids, or customer names.
- Do not remove the `Money` model or flatten all prices back into bare integers.
- Do not ignore currency values or silently coerce every currency to USD.
- Do not add third-party dependencies.
Input Files
12 files
in/shopmono/docs/interface_change.md
in/shopmono/packages/catalog/catalog/__init__.py
in/shopmono/packages/catalog/catalog/models.py
in/shopmono/packages/catalog/catalog/pricing.py
in/shopmono/packages/orders/orders/__init__.py
in/shopmono/packages/orders/orders/adapter.py
in/shopmono/packages/orders/orders/service.py
in/shopmono/packages/reports/reports/__init__.py
in/shopmono/packages/reports/reports/monthly.py
in/shopmono/pyproject.toml
in/shopmono/tests/test_catalog_contract.py
in/shopmono/tests/test_orders_report_integration.py
LLM Rubric
USER_TEMPLATE = """This task: Repair Cross-Package Interface Break in a Python Monorepo. The agent should follow the prompt, modify only the required local project files, and produce the requested artifacts.
Evaluate the agent run:
- tool_use_appropriate: uses code inspection, local tests, and file edits suited to the task.
- consistency: changes match the requested repair or report and preserve constraints.
- robustness: handles test failures, missing context, and verification steps without unrelated churn.
Return ONLY JSON with scores, security_gate, and notes.
"""
Completion Grader
def _md5(path: Path) -> str:
    """Return the hexadecimal MD5 digest of the file at *path*.

    The digest is used only as an integrity checksum, so the hash is
    created with ``usedforsecurity=False``; this keeps the call working on
    builds that restrict MD5 for security purposes. The file is read in
    fixed-size chunks so memory use does not grow with file size.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    digest = hashlib.md5(usedforsecurity=False)
    with path.open("rb") as handle:
        # iter(callable, sentinel) stops at the first empty read (EOF).
        for chunk in iter(lambda: handle.read(65536), b""):
            digest.update(chunk)
    return digest.hexdigest()
# Program passed verbatim to ``python3 -c`` by score_workspace. It exercises
# the orders/reports packages against a catalog that mixes Product/Money
# objects with a legacy ``price_cents`` dictionary, and checks that an order
# spanning two currencies raises a ValueError whose message contains "mixed".
_HIDDEN_INTERFACE_SCRIPT = """
from catalog.models import Money, Product
from orders.adapter import currency, price_cents
from orders.service import price_order
from reports.monthly import summarize_orders
catalog = {
    "A": Product("A", "A", Money(101, "USD")),
    "B": {"sku": "B", "name": "B", "price_cents": "205", "currency": "USD"},
    "C": Product("C", "C", Money(333, "CAD")),
}
assert price_cents(catalog["A"]) == 101
assert currency(catalog["A"]) == "USD"
assert price_order([{"sku": "A", "quantity": "2"}, {"sku": "B", "quantity": 1}], catalog) == {"total_cents": 407, "currency": "USD"}
try:
    price_order([{"sku": "A", "quantity": 1}, {"sku": "C", "quantity": 1}], catalog)
except ValueError as exc:
    assert "mixed" in str(exc).lower()
else:
    raise AssertionError("mixed currency not rejected")
assert summarize_orders([{"lines": [{"sku": "C", "quantity": 2}]}], catalog)["revenue_by_currency"] == {"CAD": 666}
"""


def _run_check(command: list[str], project: Path, env: dict[str, str], tail: int) -> tuple[bool, str]:
    """Run *command* inside *project* and report whether it exited with 0.

    Returns ``(passed, detail)``. On a normal run, *detail* is the last
    *tail* characters of stdout followed by the last *tail* characters of
    stderr. If the process cannot be started, times out (20 s), or its
    output cannot be decoded, the check counts as failed and *detail* is
    the exception text, so one broken run never aborts the whole grading.
    """
    try:
        proc = subprocess.run(
            command,
            cwd=project,
            env=env,
            capture_output=True,
            text=True,
            timeout=20,
        )
    except Exception as exc:  # grading boundary: record the failure, keep scoring
        return False, str(exc)
    return proc.returncode == 0, proc.stdout[-tail:] + proc.stderr[-tail:]


def _read_source(path: Path) -> str:
    """Return the text of *path*, or an empty string if it cannot be read."""
    try:
        return path.read_text(encoding="utf-8", errors="replace")
    except OSError:
        return ""


def score_workspace(workspace: Path) -> dict[str, Any]:
    """Grade the workspace and return a weighted outcome report.

    Five checks are recorded, each with an id, pass flag, weight and detail:

    * ``pytest`` (0.30): ``python3 -m pytest tests`` exits with 0.
    * ``hidden_interface_behavior`` (0.40): the hidden script exits with 0.
    * ``test_integrity`` (0.15): every file in ``TEST_HASHES`` still exists
      with its recorded MD5 digest.
    * ``implementation_quality`` (0.10): keyword heuristics over the
      orders/reports sources.
    * ``fix_report`` (0.05): keyword heuristics over the written report.

    The total is capped at 0.68 when the hidden check fails and at 0.60
    when any tracked test file was changed or removed.

    Args:
        workspace: directory containing ``in/shopmono`` and ``out``.

    Returns:
        A dict with ``task``, ``outcome_score`` (rounded to 4 places),
        ``level`` (``excellent``/``good``/``pass``/``fail``) and ``checks``.
    """
    root = Path(workspace).resolve()
    project = root / "in" / "shopmono"
    checks: list[dict[str, Any]] = []

    def add(cid: str, ok: bool, weight: float, detail: Any = None) -> None:
        checks.append({"id": cid, "pass": bool(ok), "weight": weight, "detail": detail})

    # Make the three packages importable by the child interpreters.
    # os.pathsep keeps the path list valid on every platform.
    env = os.environ.copy()
    env["PYTHONPATH"] = os.pathsep.join(
        str(project / "packages" / package)
        for package in ("catalog", "orders", "reports")
    )

    pytest_ok, pytest_detail = _run_check(
        ["python3", "-m", "pytest", "tests"], project, env, 1000
    )
    pytest_score = 1.0 if pytest_ok else 0.0
    add("pytest", pytest_ok, 0.30, pytest_detail)

    hidden_ok, hidden_detail = _run_check(
        ["python3", "-c", _HIDDEN_INTERFACE_SCRIPT], project, env, 500
    )
    hidden_score = 1.0 if hidden_ok else 0.0
    add("hidden_interface_behavior", hidden_ok, 0.40, hidden_detail)

    # A tracked test file counts as intact only if it still exists with the
    # recorded digest.
    hashes_ok = []
    for rel, digest in TEST_HASHES.items():
        path = project / rel
        hashes_ok.append(path.is_file() and _md5(path) == digest)
    integrity = sum(hashes_ok) / len(hashes_ok)
    add("test_integrity", integrity == 1.0, 0.15, {"score": integrity})

    # Unreadable files contribute no text, which lowers the keyword coverage
    # instead of raising out of the grader.
    source = "\n".join(
        _read_source(project / rel)
        for rel in (
            "packages/orders/orders/adapter.py",
            "packages/orders/orders/service.py",
            "packages/reports/reports/monthly.py",
        )
    )
    quality_terms = ["amount_cents", "currency", "price_cents", "mixed"]
    forbidden = ["sku ==", "PEN", "MUG"]
    coverage = sum(term in source for term in quality_terms) / len(quality_terms)
    no_forbidden = not any(term in source for term in forbidden)
    quality = 0.8 * coverage + 0.2 * no_forbidden
    add("implementation_quality", quality >= 0.75, 0.10, {"score": round(quality, 4)})

    report = root / "out" / "interface_fix_report.md"
    text = report.read_text(encoding="utf-8", errors="replace").lower() if report.is_file() else ""
    report_score = sum(term in text for term in ["money", "legacy", "currency", "pytest"]) / 4
    add("fix_report", report_score >= 0.75, 0.05, {"score": report_score})

    total = (
        pytest_score * 0.30
        + hidden_score * 0.40
        + integrity * 0.15
        + quality * 0.10
        + report_score * 0.05
    )
    if hidden_score < 1.0:
        total = min(total, 0.68)
    if integrity < 1.0:
        total = min(total, 0.60)

    if total >= 0.9:
        level = "excellent"
    elif total >= 0.75:
        level = "good"
    elif total >= 0.6:
        level = "pass"
    else:
        level = "fail"

    return {
        "task": "083-monorepo-interface-repair",
        "outcome_score": round(total, 4),
        "level": level,
        "checks": checks,
    }