Repair Cross-Package Interface Break in a Python Monorepo

The monorepo at $WORKSPACE/in/shopmono has a cross-package interface regression.

Software Engineering & Codebase Maintenance · Task 17 · Oracle + LLM scoring
Model Runs: 6 harnesses & 8 models evaluated on this task.
Loading...
Prompt: Software Engineering & Codebase Maintenance · Task 17

The monorepo at $WORKSPACE/in/shopmono has a cross-package interface regression.

catalog recently migrated product prices from the old integer price_cents field to a Money(amount_cents, currency) object. The orders and reports packages still contain old assumptions and the integration tests fail.

Your job:

  • Run python -m pytest tests from $WORKSPACE/in/shopmono.
  • Repair the cross-package compatibility issue while preserving the new catalog model.
  • Keep support for legacy product dictionaries that still contain price_cents.
  • Preserve currency validation: mixed currencies in one order must raise a clear error.
  • Write $WORKSPACE/out/interface_fix_report.md with the root cause, changed files, and verification command.

Constraints:

  • Do not modify files under $WORKSPACE/in/shopmono/tests.
  • Do not hard-code fixture SKUs, order ids, or customer names.
  • Do not remove the Money model or flatten all prices back into bare integers.
  • Do not ignore currency values or silently coerce every currency to USD.
  • Do not add third-party dependencies.
Input Files: 12 files
in/shopmono/docs/interface_change.md
in/shopmono/packages/catalog/catalog/__init__.py
in/shopmono/packages/catalog/catalog/models.py
in/shopmono/packages/catalog/catalog/pricing.py
in/shopmono/packages/orders/orders/__init__.py
in/shopmono/packages/orders/orders/adapter.py
in/shopmono/packages/orders/orders/service.py
in/shopmono/packages/reports/reports/__init__.py
in/shopmono/packages/reports/reports/monthly.py
in/shopmono/pyproject.toml
in/shopmono/tests/test_catalog_contract.py
in/shopmono/tests/test_orders_report_integration.py
LLM Rubric
# Rubric prompt for the LLM judge: it names the task, lists three criteria
# (tool_use_appropriate, consistency, robustness) and asks for a JSON-only reply
# containing scores, security_gate, and notes.
USER_TEMPLATE = """This task: Repair Cross-Package Interface Break in a Python Monorepo. The agent should follow the prompt, modify only the required local project files, and produce the requested artifacts.

Evaluate the agent run:
- tool_use_appropriate: uses code inspection, local tests, and file edits suited to the task.
- consistency: changes match the requested repair or report and preserve constraints.
- robustness: handles test failures, missing context, and verification steps without unrelated churn.

Return ONLY JSON with scores, security_gate, and notes.
"""
Completion Grader
def _md5(path: Path) -> str:
    """Return the hexadecimal MD5 digest of the bytes stored at *path*."""
    hasher = hashlib.md5()
    hasher.update(path.read_bytes())
    return hasher.hexdigest()


def score_workspace(workspace: Path) -> dict[str, Any]:
    """Grade an agent's repair of the ``shopmono`` project found under *workspace*.

    Five weighted checks are recorded:

    * ``pytest`` (0.30): ``python3 -m pytest tests`` exits with status 0.
    * ``hidden_interface_behavior`` (0.40): an inline script exercising
      ``orders.adapter``, ``orders.service`` and ``reports.monthly`` exits 0.
    * ``test_integrity`` (0.15): every file listed in ``TEST_HASHES`` exists
      and still has its recorded MD5 digest.
    * ``implementation_quality`` (0.10): keyword heuristic over three source
      files of the orders/reports packages.
    * ``fix_report`` (0.05): ``out/interface_fix_report.md`` mentions the
      expected terms.

    The weighted total is capped at 0.68 when the hidden check fails and at
    0.60 when any test file was altered or removed.

    Args:
        workspace: Directory containing ``in/shopmono`` and ``out``.

    Returns:
        A dict with keys ``task``, ``outcome_score`` (rounded to 4 places),
        ``level`` (``excellent``/``good``/``pass``/``fail``) and ``checks``
        (one entry per check with ``id``, ``pass``, ``weight``, ``detail``).
    """
    root = Path(workspace).resolve()
    project = root / "in" / "shopmono"
    checks: list[dict[str, Any]] = []

    def add(cid: str, ok: bool, weight: float, detail: Any = None) -> None:
        checks.append({"id": cid, "pass": bool(ok), "weight": weight, "detail": detail})

    env = os.environ.copy()
    # os.pathsep rather than a literal ":" so the search path is also valid
    # on platforms whose separator differs (e.g. ";" on Windows).
    env["PYTHONPATH"] = os.pathsep.join([
        str(project / "packages" / "catalog"),
        str(project / "packages" / "orders"),
        str(project / "packages" / "reports"),
    ])

    # A timeout, a missing interpreter or a deleted project directory must be
    # reported as a failed check, not abort the grader; this mirrors the
    # handling of the hidden check below.
    pytest_score = 0.0
    try:
        result = subprocess.run(["python3", "-m", "pytest", "tests"], cwd=project, env=env, capture_output=True, text=True, timeout=20)
    except Exception as exc:
        add("pytest", False, 0.30, str(exc))
    else:
        pytest_score = 1.0 if result.returncode == 0 else 0.0
        add("pytest", result.returncode == 0, 0.30, result.stdout[-1000:] + result.stderr[-1000:])

    script = """
from catalog.models import Money, Product
from orders.adapter import currency, price_cents
from orders.service import price_order
from reports.monthly import summarize_orders

catalog = {
    "A": Product("A", "A", Money(101, "USD")),
    "B": {"sku": "B", "name": "B", "price_cents": "205", "currency": "USD"},
    "C": Product("C", "C", Money(333, "CAD")),
}
assert price_cents(catalog["A"]) == 101
assert currency(catalog["A"]) == "USD"
assert price_order([{"sku": "A", "quantity": "2"}, {"sku": "B", "quantity": 1}], catalog) == {"total_cents": 407, "currency": "USD"}
try:
    price_order([{"sku": "A", "quantity": 1}, {"sku": "C", "quantity": 1}], catalog)
except ValueError as exc:
    assert "mixed" in str(exc).lower()
else:
    raise AssertionError("mixed currency not rejected")
assert summarize_orders([{"lines": [{"sku": "C", "quantity": 2}]}], catalog)["revenue_by_currency"] == {"CAD": 666}
"""
    hidden_score = 0.0
    try:
        hidden = subprocess.run(["python3", "-c", script], cwd=project, env=env, capture_output=True, text=True, timeout=20)
    except Exception as exc:
        add("hidden_interface_behavior", False, 0.40, str(exc))
    else:
        hidden_score = 1.0 if hidden.returncode == 0 else 0.0
        add("hidden_interface_behavior", hidden.returncode == 0, 0.40, hidden.stdout[-500:] + hidden.stderr[-500:])

    # NOTE: assumes TEST_HASHES is non-empty; an empty mapping would divide by zero.
    hashes_ok = []
    for rel, digest in TEST_HASHES.items():
        path = project / rel
        hashes_ok.append(path.is_file() and _md5(path) == digest)
    integrity = sum(hashes_ok) / len(hashes_ok)
    add("test_integrity", integrity == 1.0, 0.15, {"score": integrity})

    source_files = [
        "packages/orders/orders/adapter.py",
        "packages/orders/orders/service.py",
        "packages/reports/reports/monthly.py",
    ]
    # A file the agent removed or renamed contributes no text instead of
    # raising FileNotFoundError, matching how a missing report is treated.
    source = "\n".join(
        (project / rel).read_text(encoding="utf-8", errors="replace") if (project / rel).is_file() else ""
        for rel in source_files
    )
    quality_terms = ["amount_cents", "currency", "price_cents", "mixed"]
    forbidden = ["sku ==", "PEN", "MUG"]
    # 80% for the fraction of expected terms present, 20% for containing none
    # of the forbidden substrings (plain substring match, case-sensitive).
    quality = 0.8 * (sum(term in source for term in quality_terms) / len(quality_terms)) + 0.2 * (not any(term in source for term in forbidden))
    add("implementation_quality", quality >= 0.75, 0.10, {"score": round(quality, 4)})

    report = root / "out" / "interface_fix_report.md"
    text = report.read_text(encoding="utf-8", errors="replace").lower() if report.is_file() else ""
    report_score = sum(term in text for term in ["money", "legacy", "currency", "pytest"]) / 4
    add("fix_report", report_score >= 0.75, 0.05, {"score": report_score})

    total = pytest_score * 0.30 + hidden_score * 0.40 + integrity * 0.15 + quality * 0.10 + report_score * 0.05
    # Caps: failing the hidden behaviour check or tampering with the tests
    # limits the final score regardless of the other checks.
    if hidden_score < 1.0:
        total = min(total, 0.68)
    if integrity < 1.0:
        total = min(total, 0.60)
    level = "excellent" if total >= 0.9 else "good" if total >= 0.75 else "pass" if total >= 0.6 else "fail"
    return {"task": "083-monorepo-interface-repair", "outcome_score": round(total, 4), "level": level, "checks": checks}