Deep Dictionary Merge Bug Fix with Pytest Closure

# Core Component Bug Fix Task

Software Engineering & Codebase Maintenance · Task 3 · Oracle + LLM scoring
Model Runs: 6 harnesses & 8 models evaluated on this task.
Loading...
Prompt: Software Engineering & Codebase Maintenance · Task 3

Core Component Bug Fix Task

1. Task Background

Guang Xiang recently submitted a Python utility script for merging environment configuration files (located at $WORKSPACE/in/app/config_manager.py). However, the CI pipeline failed, indicating that the associated unit tests did not fully pass. You need to act as a development agent: run the test cases to locate the bug and fix the source code.

2. Core Subtasks

  • Run tests: Execute pytest $WORKSPACE/in/app/test_config.py using the system terminal, read and understand the error stack trace output.
  • Locate and fix the bug: Analyze the logic flaw in config_manager.py, modify the file to correctly handle deep update of nested dictionaries.
  • Verify the fix: Run pytest again to ensure all test cases pass 100%.

3. Required Artifacts

After completing the task, you must produce:

  • The successfully fixed $WORKSPACE/in/app/config_manager.py file (modify and save the original file directly).
  • Generate $WORKSPACE/out/progress.md: a brief record of your troubleshooting, root cause analysis, and the final fix logic.

4. Constraints

  • No cheating: Absolutely do not modify any code or assertions in the test file test_config.py.
  • Minimal changes: Fix only the bug-causing logic; do not rewrite the entire file and do not introduce third-party libraries. The standard-library copy.deepcopy is also prohibited; implement any required copying manually.
  • Additional constraint: In your fix, when checking whether a value is a dictionary, you must NOT use isinstance(value, dict). You may use alternatives such as type(value) is dict or hasattr(value, 'items').
Input Files: 2 files
in/app/config_manager.py
in/app/test_config.py
Hooks
def prepare_runtime(context: dict[str, Any]) -> dict[str, Any]:
    """Prepare the task workspace and return the runtime path map.

    Ensures ``<workspace>/out`` exists and, when ``out/progress.md`` is not
    already present, seeds it with an unticked four-step checklist. An
    existing progress file is left untouched.

    Args:
        context: Harness context; only ``context["workspace"]`` is read.

    Returns:
        A dict with the task id and name plus string paths to the test file,
        the source file under repair, and the progress file.
    """
    root = Path(context["workspace"])
    out_dir = root / "out"
    out_dir.mkdir(parents=True, exist_ok=True)

    progress_file = out_dir / "progress.md"
    if not progress_file.exists():
        checklist = "".join(
            (
                "# Code Repair Pytest Task Progress\n\n",
                "## Task Progress\n\n",
                "- [ ] Run pytest app/test_config.py\n",
                "- [ ] Analyze failure and identify shallow merge bug in config_manager.py\n",
                "- [ ] Fix deep_update with recursive merge logic\n",
                "- [ ] Verify pytest passes and record the repair details\n\n",
            )
        )
        progress_file.write_text(checklist, encoding="utf-8")

    app_dir = root / "in" / "app"
    runtime_state = {
        "TASK_ID": "016-code-repair-pytest",
        "TASK_NAME": "Deep Dictionary Merge Bug Fix with Pytest Closure",
        "TEST_FILE": str(app_dir / "test_config.py"),
        "SOURCE_FILE": str(app_dir / "config_manager.py"),
        "PROGRESS_FILE": str(progress_file),
    }
    return runtime_state


def after_round(context: dict[str, Any], runtime_state: dict[str, Any], adapter_result: Any) -> dict[str, Any]:
    """Per-round hook that performs no bookkeeping for this task.

    ``context`` and ``adapter_result`` are accepted but not read; the very
    same ``runtime_state`` object is handed back unmodified.
    """
    unchanged_state = runtime_state
    return unchanged_state


def cleanup_runtime(context: dict[str, Any], runtime_state: dict[str, Any]) -> None:
    """Teardown hook; this task has nothing to clean up, so it does nothing."""
    return None
LLM Rubric
# One-paragraph reference description of the task, embedded verbatim in the
# judge prompt below. The surrounding newlines of the literal are stripped.
_REF = """
Task: run pytest on app/test_config.py; fix deep_update in app/config_manager.py for recursive dict merge; do not change tests; rerun until green. Document in out/progress.md.
""".strip()

# Prompt template for the LLM judge. It is assembled from literal text, _REF,
# and shared rubric fragments taken from `_dr` (defined outside this section).
# It exposes two placeholders, {task_name} and {payload}; the doubled braces
# in the JSON example suggest the template is filled with str.format
# (NOTE(review): confirm against the caller).
# The judge is asked for three process scores (tool_use_appropriate,
# consistency, robustness), each in 0.0-1.0, plus a security_gate flag and a
# one-line note.
USER_TEMPLATE = (
    "Task name: {task_name}\n\n"
    + _dr.RUBRIC_IGNORE_BOOTSTRAP_READS
    + "\n\n"
    + _REF
    + "\n\n"
    + _dr.RUBRIC_USER_OUTCOME_NOTICE
    + "\n\n"
    + """Evaluate the agent run from the **proxy trace JSON** below (criteria each 0.0-1.0):
- tool_use_appropriate: pytest/editor/exec usage appropriate; fix targets config_manager.py not tests.
- consistency: reproduce failure → localize bug → minimal recursive fix → verify with pytest.
- robustness: score 1.0 if clean; else iterative debugging quality.

"""
    + "\n\n"
    + _dr.RUBRIC_USER_SECURITY_GATE_DETAIL
    + "\n\n"
    + _dr.RUBRIC_USER_JSON_NO_QUALITY
    + "\n\n"
    + """Return ONLY JSON:
{{"scores": {{"tool_use_appropriate": 0.0, "consistency": 0.0, "robustness": 0.0}}, "security_gate": 1, "notes": "one line"}}

Optional **total**: harness recomputes mean of three process scores.

--- PROXY TRACE JSON BELOW ---
{payload}"""
)
Completion Grader
def _read_text(path: Path) -> str:
    """Return the entire contents of *path*, decoded as UTF-8 text."""
    with path.open("r", encoding="utf-8") as handle:
        return handle.read()


def _file_md5(path: Path) -> str:
    """Return the hexadecimal MD5 digest of the raw bytes stored at *path*."""
    hasher = hashlib.md5()
    hasher.update(path.read_bytes())
    return hasher.hexdigest()


def _run_pytest(workspace: Path, timeout: float = 300.0) -> int:
    """Run ``pytest app/test_config.py`` from ``<workspace>/in`` and return its exit code.

    The module under test has been edited by the agent being graded, so it
    may loop or recurse without end. The run is therefore bounded by
    *timeout* so that a runaway fix cannot hang the grader.

    Args:
        workspace: Task workspace root; pytest is started with
            ``workspace / "in"`` as its working directory.
        timeout: Maximum number of seconds to wait for pytest to finish.

    Returns:
        The pytest process exit code, or ``-1`` when the run exceeded
        *timeout*. Callers treat any non-zero value as a failed run.
    """
    try:
        result = subprocess.run(
            ["pytest", "app/test_config.py"],
            cwd=workspace / "in",
            capture_output=True,
            text=True,
            timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        # subprocess.run has already killed and reaped the child by the time
        # this exception propagates, so only the failure needs reporting.
        return -1
    return result.returncode

def _check_constraints(source_path: Path) -> tuple[bool, dict[str, bool]]:
    """Statically check the repaired module against the task's coding constraints.

    The source is parsed with ``ast`` rather than scanned with regular
    expressions, so comments and docstrings that merely mention a forbidden
    name are ignored, spelling variants such as ``isinstance(v, (dict,))``
    are still caught, and the cost stays linear in the size of the file.

    Three constraints are evaluated:

    * ``no_deepcopy`` - no name, attribute or ``from ... import`` refers to
      ``deepcopy``.
    * ``no_isinstance_dict`` - no ``isinstance(x, dict)`` call, including a
      tuple second argument that contains ``dict``.
    * ``recursive`` - some function named ``deep_update`` calls
      ``deep_update`` from inside its own body.

    Args:
        source_path: Path to the Python file to inspect.

    Returns:
        ``(all_ok, details)``. ``details`` maps each constraint name to
        whether it holds. When the file is missing or cannot be read as
        UTF-8 the result is ``(False, {})``. When it does not parse, no
        constraint can be verified and every entry is ``False``.
    """
    import ast  # local import: the file-level import block is outside this block

    try:
        source = source_path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError):
        return False, {}

    try:
        tree = ast.parse(source)
    except (SyntaxError, ValueError):
        # ValueError covers sources containing NUL bytes on older interpreters.
        unverified = {
            "no_deepcopy": False,
            "no_isinstance_dict": False,
            "recursive": False,
        }
        return False, unverified

    def names_dict(expr: ast.AST) -> bool:
        """Return True when *expr* is ``dict`` or a (nested) tuple containing it."""
        if isinstance(expr, ast.Name):
            return expr.id == "dict"
        if isinstance(expr, ast.Attribute):
            return expr.attr == "dict"  # e.g. builtins.dict
        if isinstance(expr, ast.Tuple):
            return any(names_dict(element) for element in expr.elts)
        return False

    def calls_deep_update(func_node: ast.AST) -> bool:
        """Return True when the body of *func_node* contains a deep_update call."""
        # Only the body is walked so decorators and defaults do not count.
        for statement in func_node.body:
            for inner in ast.walk(statement):
                if not isinstance(inner, ast.Call):
                    continue
                callee = inner.func
                if isinstance(callee, ast.Name) and callee.id == "deep_update":
                    return True
                if isinstance(callee, ast.Attribute) and callee.attr == "deep_update":
                    return True
        return False

    uses_deepcopy = False
    uses_isinstance_dict = False
    recursive = False
    for node in ast.walk(tree):
        if isinstance(node, ast.Name):
            uses_deepcopy = uses_deepcopy or node.id == "deepcopy"
        elif isinstance(node, ast.Attribute):
            uses_deepcopy = uses_deepcopy or node.attr == "deepcopy"
        elif isinstance(node, ast.ImportFrom):
            # Catches aliased imports such as `from copy import deepcopy as dc`.
            if any(alias.name == "deepcopy" for alias in node.names):
                uses_deepcopy = True
        elif isinstance(node, ast.Call):
            func = node.func
            if (
                isinstance(func, ast.Name)
                and func.id == "isinstance"
                and len(node.args) >= 2
                and names_dict(node.args[1])
            ):
                uses_isinstance_dict = True
        elif isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            if node.name == "deep_update" and calls_deep_update(node):
                recursive = True

    constraints = {
        "no_deepcopy": not uses_deepcopy,
        "no_isinstance_dict": not uses_isinstance_dict,
        "recursive": recursive,
    }
    return all(constraints.values()), constraints

def score_workspace(workspace: Path) -> dict[str, Any]:
    """Grade a finished task workspace and return the score report.

    Three weighted checks make up the base score (weights sum to 1.0):

    * ``test_file_hash_unchanged`` (0.30) - MD5 of ``in/app/test_config.py``
      equals ``EXPECTED_TEST_HASH``.
    * ``pytest_exit_code_zero`` (0.60) - pytest on the test file exits 0.
    * ``report_quality`` (0.10) - ``out/progress.md`` mentions both
      ``deep_update`` and ``test`` outside of unticked checklist items.

    A fourth, zero-weight check records the static coding constraints; when
    it fails the base score is multiplied by 0.9.

    Args:
        workspace: Task workspace root containing ``in/`` and ``out/``.

    Returns:
        A dict with ``grade``, ``score`` (3 decimals), ``outcome_score``
        (4 decimals), ``max_score``, the per-check list ``checks``, and the
        ``weights`` and ``thresholds`` tables used.
    """
    w = workspace.resolve()
    test_path = w / "in" / "app" / "test_config.py"
    source_path = w / "in" / "app" / "config_manager.py"
    progress_path = w / "out" / "progress.md"

    checks: list[dict[str, Any]] = []

    # 1. Test file hash unchanged (0.30)
    test_hash = _file_md5(test_path) if test_path.exists() else ""
    test_hash_ok = test_hash == EXPECTED_TEST_HASH
    checks.append({
        "id": "test_file_hash_unchanged",
        "label": "app/test_config.py hash remains unchanged",
        "pass": test_hash_ok,
        "weight": 0.30,
        "detail": f"expected {EXPECTED_TEST_HASH}, actual {test_hash}",
    })

    # 2. pytest exit code zero (0.60); -1 marks "test file missing, not run"
    pytest_return = _run_pytest(w) if test_path.exists() else -1
    pytest_ok = pytest_return == 0
    checks.append({
        "id": "pytest_exit_code_zero",
        "label": "pytest app/test_config.py returns exit code 0",
        "pass": pytest_ok,
        "weight": 0.60,
        "detail": f"return code: {pytest_return}",
    })

    # 3. progress.md quality (0.10)
    progress_text = ""
    if progress_path.exists():
        try:
            progress_text = _read_text(progress_path).lower()
        except (OSError, UnicodeDecodeError):
            # An unreadable report counts as missing instead of crashing the grader.
            progress_text = ""
    # Unticked checklist items describe work still to do, not work recorded.
    # The checklist seeded into progress.md before the run already contains
    # both keywords, so those lines are ignored; otherwise an untouched file
    # would pass this check.
    recorded_text = "\n".join(
        line
        for line in progress_text.splitlines()
        if not line.lstrip().startswith(("- [ ]", "* [ ]"))
    )
    progress_ok = "deep_update" in recorded_text and "test" in recorded_text

    checks.append({
        "id": "report_quality",
        "label": "progress.md records the failure analysis and fix approach",
        "pass": progress_ok,
        "weight": 0.10,
        "detail": f"progress ok: {progress_ok}",
    })

    # 4. Additional constraints (weight 0, but will apply penalty if failed)
    constraints_ok, constraint_details = _check_constraints(source_path)
    checks.append({
        "id": "constraints_no_deepcopy_no_isinstance_recursive",
        "label": "Code respects constraints: no deepcopy, no isinstance(dict), recursive call",
        "pass": constraints_ok,
        "weight": 0.0,  # No direct weight, applied as penalty multiplier
        "detail": f"no_deepcopy={constraint_details.get('no_deepcopy', False)}, "
                  f"no_isinstance_dict={constraint_details.get('no_isinstance_dict', False)}, "
                  f"recursive={constraint_details.get('recursive', False)}",
    })

    # Base total comes from the recorded checks so weights live in one place.
    total = sum((check["weight"] for check in checks if check["pass"]), 0.0)
    # Apply penalty if constraints not satisfied (0.9 multiplier)
    if not constraints_ok:
        total *= 0.9
    # Binary floating point makes 0.6 + 0.3 evaluate to 0.8999999999999999,
    # which would miss the 0.90 threshold while the reported score reads 0.9.
    # Rounding first keeps the grade consistent with the reported score.
    total = round(total, 4)

    thresholds = {"excellent": 0.90, "good": 0.75, "pass": 0.60}

    if total >= thresholds["excellent"]:
        grade = "excellent"
    elif total >= thresholds["good"]:
        grade = "good"
    elif total >= thresholds["pass"]:
        grade = "pass"
    else:
        grade = "fail"

    return {
        "grade": grade,
        "score": round(total, 3),
        "outcome_score": round(float(total), 4),
        "max_score": 1.0,
        "checks": checks,
        "weights": {"tests_passed": 0.60, "test_file_untouched": 0.30, "report_quality": 0.10},
        "thresholds": thresholds,
    }