Dependency Upgrade Compatibility

The fixture package in $WORKSPACE/in/depsvc must be updated for a small dependency compatibility requirement.

Software Engineering & Codebase Maintenance · Task 12 · Oracle + LLM scoring

Task ID: 045-dependency-upgrade-compat

Difficulty: Medium

Tags

Model Runs: 6 harnesses & 8 models evaluated on this task.

Prompt: Software Engineering & Codebase Maintenance · Task 12

The fixture package in $WORKSPACE/in/depsvc must be updated for a small dependency compatibility requirement.

Your job:

Edit $WORKSPACE/in/depsvc/requirements.txt and, if needed, $WORKSPACE/in/depsvc/slugger.py.
Follow $WORKSPACE/in/depsvc/compat_notes.md.
Keep the upgrade minimal: only the python-slugify pin should change.
Make python -m pytest $WORKSPACE/in/depsvc/tests pass.
Write $WORKSPACE/in/depsvc/compat_decision.md explaining the minimal upgrade, behavior risks, and why no unrelated dependencies were added.
Write $WORKSPACE/in/depsvc/compat_matrix.json listing behavior cases considered during the upgrade. It must include cases for default ASCII output, Unicode preservation, custom separators, invalid separators, empty input, non-string input, and repeated punctuation/whitespace collapse.
Write $WORKSPACE/in/depsvc/compat_risks.md summarizing behavior risks, accepted compatibility assumptions, rollback plan, and the exact commands/tests used to verify the upgrade.
Preserve the public make_slug(title, preserve_unicode=False, separator="-") signature.
Hidden checks will exercise Unicode preservation, invalid separators, empty input, no-argument/default behavior, non-string input, repeated punctuation, and explicit forwarding of allow_unicode/separator into the upgraded dependency.

Do not remove tests, do not add network calls, and do not vendor dependencies.

Input Files: 4 files

in/depsvc/compat_notes.md

in/depsvc/requirements.txt

in/depsvc/slugger.py

in/depsvc/tests/test_slugger.py

LLM Rubric

# Task-specific reference text for the LLM judge: what the oracle already
# covers and which qualitative aspects the judge should weigh.
_REF = (
    "Tests + lock sanity by oracle; qualitatively: dependency research, "
    "phased edits, breakage triage vs blind bumps."
)

# Prompt template for the LLM judge. Sections are separated by one blank line.
# `{task_name}` and `{payload}` are placeholders and the doubled braces are
# escaped literals -- presumably the template is rendered with `str.format`;
# confirm against the caller.
USER_TEMPLATE = "\n\n".join(
    [
        "Task name: {task_name}",
        _dr.RUBRIC_IGNORE_BOOTSTRAP_READS,
        _REF,
        # Empty section: keeps the wider (four-newline) gap that follows the
        # reference text in the rendered prompt.
        "",
        _dr.RUBRIC_USER_SECURITY_GATE_DETAIL,
        _dr.RUBRIC_USER_JSON_NO_QUALITY,
        (
            "Return ONLY JSON:\n"
            '{{"scores": {{"tool_use_appropriate": 0.0, "consistency": 0.0, "robustness": 0.0}}, '
            '"security_gate": 1, "notes": "one line"}}\n\n'
            "--- PROXY TRACE JSON BELOW ---\n"
            "{payload}"
        ),
    ]
)

Completion Grader

def _md5(path: Path) -> str:
    return hashlib.md5(path.read_bytes()).hexdigest()


def _load_module_from(path: Path, module_name: str) -> Any:
    """Execute the Python file at *path* as a fresh module and return it.

    Raises:
        ImportError: if no import spec/loader can be built for *path*.
        Exception: whatever the module's top-level code raises.
    """
    spec = importlib.util.spec_from_file_location(module_name, path)
    # Explicit raise instead of `assert`: asserts are stripped under `-O`.
    if spec is None or spec.loader is None:
        raise ImportError(f"cannot build an import spec for {path}")
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module


def _raises(expected: type, func: Any, *args: Any, **kwargs: Any) -> bool:
    """Return True if ``func(*args, **kwargs)`` raises *expected*, else False.

    Exceptions of any other type propagate to the caller.
    """
    try:
        func(*args, **kwargs)
    except expected:
        return True
    return False


def _slugger_behavior_score(slugger_path: Path) -> tuple[float, Any]:
    """Exercise ``make_slug`` from *slugger_path* and score its behaviour.

    Two phases are run: direct calls against the real dependency, then a
    reload with a stub ``slugify`` module installed in ``sys.modules`` to
    verify that ``separator`` and ``allow_unicode`` are forwarded.

    Returns:
        ``(score, error)`` where ``score`` is the fraction of passed checks in
        ``[0.0, 1.0]`` and ``error`` is ``None``. If loading the module or any
        probe raises an unexpected exception, returns ``(0.0, "<Type>: <msg>")``.
    """
    try:
        mod = _load_module_from(slugger_path, "slugger_under_test")
        make_slug = mod.make_slug
        params = list(inspect.signature(make_slug).parameters.values())
        signature_ok = (
            [p.name for p in params] == ["title", "preserve_unicode", "separator"]
            and params[1].default is False
            and params[2].default == "-"
        )
        results: list[bool] = [
            make_slug("Café API") == "cafe-api",
            make_slug("Café API", preserve_unicode=True) == "café-api",
            make_slug("Hello Billing API", separator="_") == "hello_billing_api",
            make_slug("Hello,   Billing___API") == "hello-billing-api",
            make_slug("") == "",
            signature_ok,
            _raises(ValueError, make_slug, "Bad Separator", separator="/"),
            _raises(ValueError, make_slug, "Bad Separator", separator="--"),
            _raises(TypeError, make_slug),
            _raises(ValueError, make_slug, None),
            # NOTE(review): same assertion as the fourth check above; kept so
            # the number of checks (and therefore every score) is unchanged.
            make_slug("Hello,   Billing___API") == "hello-billing-api",
            make_slug("Hello...Billing", separator="_") == "hello_billing",
        ]

        # Phase 2: reload slugger.py against a recording stub of `slugify`.
        fake_calls: list[dict[str, Any]] = []

        def fake_slugify(value: str, separator: str = "-", allow_unicode: bool = False) -> str:
            fake_calls.append({"value": value, "separator": separator, "allow_unicode": allow_unicode})
            return f"{value}|{separator}|{allow_unicode}"

        fake_module = types.ModuleType("slugify")
        fake_module.slugify = fake_slugify
        previous = sys.modules.get("slugify")
        sys.modules["slugify"] = fake_module
        try:
            fake_mod = _load_module_from(slugger_path, "slugger_fake_dep")
            fake_result = fake_mod.make_slug("Café API", preserve_unicode=True, separator="_")
        finally:
            # Always restore the real dependency (or its absence).
            if previous is None:
                sys.modules.pop("slugify", None)
            else:
                sys.modules["slugify"] = previous
        results.append(fake_result == "Café API|_|True")
        # bool(...) matters: a bare `fake_calls and ...` yields `[]` when the
        # stub was never called, which made sum() raise and zeroed the score.
        results.append(
            bool(fake_calls)
            and fake_calls[-1] == {"value": "Café API", "separator": "_", "allow_unicode": True}
        )
        return sum(results) / len(results), None
    except Exception as exc:
        # Boundary around submitted code: any unexpected failure scores zero,
        # but the reason is reported instead of being silently dropped.
        return 0.0, f"{type(exc).__name__}: {exc}"


def score_workspace(workspace: Path) -> dict[str, Any]:
    """Grade a finished workspace for task ``045-dependency-upgrade-compat``.

    The project directory is ``<workspace>/in/depsvc``, falling back to
    ``<workspace>/depsvc``. Seven checks are evaluated: the requirements pin,
    the pytest run, slugger code/behaviour, the decision, matrix and risks
    documents, and the integrity of ``compat_notes.md``. Missing files and a
    pytest run that times out or cannot be launched fail their check instead
    of raising.

    Args:
        workspace: Root directory of the workspace to grade.

    Returns:
        A dict with keys ``task``, ``outcome_score`` (weighted total rounded
        to 4 places, capped at 0.86 when the risks score is below 0.60),
        ``level`` (``excellent``/``good``/``pass``/``fail`` per the thresholds
        in ``_GT["scoring"]``) and ``checks`` (per-check id, pass flag, weight
        and detail).
    """
    root = Path(workspace).resolve()
    project = root / "in" / "depsvc"
    if not project.exists():
        project = root / "depsvc"
    checks: list[dict[str, Any]] = []

    def add(cid: str, ok: bool, weight: float, detail: Any = None) -> None:
        checks.append({"id": cid, "pass": bool(ok), "weight": weight, "detail": detail})

    def read_or_empty(path: Path) -> str:
        """Return the text of *path*, or "" when it is not a regular file."""
        return path.read_text(encoding="utf-8", errors="replace") if path.is_file() else ""

    # NOTE(review): the per-check `weight` values reported below sum to 1.06,
    # while the weights used for `total` sum to 1.00. Left unchanged so that
    # reported output and scores stay stable -- confirm which set is intended.

    # --- requirements.txt: exactly the expected pin, nothing else ---
    req_text = read_or_empty(project / "requirements.txt").strip()
    stripped_lines = (line.strip() for line in req_text.splitlines())
    lines = [line for line in stripped_lines if line and not line.startswith("#")]
    req_lower = req_text.lower()
    forbidden = [term for term in _GT["forbidden_requirement_terms"] if term in req_lower]
    raw_exact = req_text == _GT["expected_requirement"]
    req_score = 1.0 if lines == [_GT["expected_requirement"]] and not forbidden and raw_exact else 0.0
    add("requirements_minimal", req_score == 1.0, 0.25, {"lines": lines, "forbidden": forbidden, "raw_exact": raw_exact})

    # --- project test suite ---
    env = os.environ.copy()
    env["PYTHONPATH"] = str(project)
    try:
        result = subprocess.run(
            ["python3", "-m", "pytest", "tests"],
            cwd=project,
            env=env,
            capture_output=True,
            text=True,
            timeout=20,
        )
    except subprocess.TimeoutExpired:
        pytest_ok = False
        pytest_detail = "pytest timed out after 20 seconds"
    except OSError as exc:
        # Missing interpreter or project directory: fail the check, keep grading.
        pytest_ok = False
        pytest_detail = f"pytest could not be started: {exc}"
    else:
        pytest_ok = result.returncode == 0
        pytest_detail = result.stdout[-1000:] + result.stderr[-1000:]
    pytest_score = 1.0 if pytest_ok else 0.0
    add("pytest", pytest_ok, 0.25, pytest_detail)

    # --- slugger.py: required source terms (case-sensitive) + runtime behaviour ---
    slugger_path = project / "slugger.py"
    code = read_or_empty(slugger_path)
    required_terms = _GT["required_code_terms"]
    term_hits = sum(term in code for term in required_terms)
    code_score = min(term_hits / max(len(required_terms), 1), 1.0)
    behavior_score, behavior_error = _slugger_behavior_score(slugger_path)
    compat_score = 0.45 * code_score + 0.55 * behavior_score
    add(
        "compat_code",
        compat_score >= 0.85,
        0.25,
        {"term_hits": term_hits, "behavior_score": behavior_score, "error": behavior_error},
    )

    # --- compat_decision.md: case-insensitive keyword coverage ---
    decision_terms = _GT["decision_terms"]
    decision_text = read_or_empty(project / "compat_decision.md").lower()
    decision_score = sum(term.lower() in decision_text for term in decision_terms) / max(len(decision_terms), 1)
    add("compat_decision", decision_score >= 0.80, 0.08, {"score": decision_score})

    # --- compat_matrix.json: keyword coverage + enough listed cases ---
    matrix_score = 0.0
    try:
        matrix_terms = _GT["matrix_terms"]
        matrix_data = json.loads((project / "compat_matrix.json").read_text(encoding="utf-8"))
        matrix_text = json.dumps(matrix_data, ensure_ascii=False).lower()
        matrix_hits = sum(term.lower() in matrix_text for term in matrix_terms)
        if isinstance(matrix_data, list):
            cases = matrix_data
        elif isinstance(matrix_data, dict):
            cases = matrix_data.get("cases") or matrix_data.get("behavior_cases") or []
        else:
            cases = []
        enough_cases = isinstance(cases, list) and len(cases) >= len(matrix_terms)
        matrix_score = 0.65 * (matrix_hits / max(len(matrix_terms), 1)) + 0.35 * enough_cases
    except Exception:
        # Missing, unreadable or malformed matrix file scores zero.
        matrix_score = 0.0
    add("compat_matrix", matrix_score >= 0.85, 0.10, {"score": round(matrix_score, 4)})

    # --- compat_risks.md: case-insensitive keyword coverage ---
    risk_terms = _GT.get("risk_terms", [])
    risk_text = read_or_empty(project / "compat_risks.md").lower()
    risk_score = sum(term.lower() in risk_text for term in risk_terms) / max(len(risk_terms), 1)
    add("compat_risks", risk_score >= 0.80, 0.08, {"score": round(risk_score, 4)})

    # --- compat_notes.md must be byte-identical to the shipped fixture ---
    notes_path = project / "compat_notes.md"
    notes_ok = notes_path.is_file() and _md5(notes_path) == _NOTES_HASH
    add("notes_integrity", notes_ok, 0.05)

    total = (
        req_score * 0.22
        + pytest_score * 0.24
        + compat_score * 0.24
        + decision_score * 0.08
        + matrix_score * 0.09
        + risk_score * 0.08
        + (1.0 if notes_ok else 0.0) * 0.05
    )
    # A weak risks write-up caps the overall score.
    if risk_score < 0.60:
        total = min(total, 0.86)

    thresholds = _GT["scoring"]["thresholds"]
    if total >= thresholds["excellent"]:
        level = "excellent"
    elif total >= thresholds["good"]:
        level = "good"
    elif total >= thresholds["pass"]:
        level = "pass"
    else:
        level = "fail"
    return {"task": "045-dependency-upgrade-compat", "outcome_score": round(total, 4), "level": level, "checks": checks}

← Task 11 Task 13 →