Model Runs: 6 harnesses & 8 models evaluated on this task.
Loading...
Prompt: Software Engineering & Codebase Maintenance · Task 3
Core Component Bug Fix Task
1. Task Background
Guang Xiang recently submitted a Python utility script for merging environment configuration files (located at $WORKSPACE/in/app/config_manager.py). However, the CI pipeline failed, indicating that the associated unit tests did not fully pass. You need to act as a development agent: run the test cases to locate the bug and fix the source code.
2. Core Subtasks
- Run tests: Execute `pytest $WORKSPACE/in/app/test_config.py` using the system terminal, then read and understand the error stack trace output.
- Locate and fix the bug: Analyze the logic flaw in `config_manager.py` and modify the file to correctly handle deep updates of nested dictionaries.
- Verify the fix: Run `pytest` again to ensure all test cases pass 100%.
3. Required Artifacts
After completing the task, you must produce:
- The successfully fixed `$WORKSPACE/in/app/config_manager.py` file (modify and save the original file directly).
- Generate `$WORKSPACE/out/progress.md`: a brief record of your troubleshooting, root cause analysis, and the final fix logic.
4. Constraints
- No cheating: Absolutely do not modify any code or assertions in the test file `test_config.py`.
- Minimal changes: Fix only the bug‑causing logic; do not rewrite the entire file or introduce third‑party libraries (e.g., `copy.deepcopy` is prohibited; implement manually).
- Additional constraint: In your fix, when checking whether a value is a dictionary, you must NOT use `isinstance(value, dict)`. You may use alternatives such as `type(value) is dict` or `hasattr(value, 'items')`.
Input Files: 2 files
in/app/config_manager.py
in/app/test_config.py
Hooks
def prepare_runtime(context: dict[str, Any]) -> dict[str, Any]:
    """Prepare the workspace for a run and return the task's runtime state.

    Ensures ``<workspace>/out`` exists and, when ``out/progress.md`` is not
    already present, writes a four-item checklist into it. An existing
    progress file is left untouched.

    Args:
        context: Hook context; only ``context["workspace"]`` is read.

    Returns:
        A dict with the task id and name plus the string paths of the test
        file, the source file and the progress file.
    """
    root = Path(context["workspace"])
    app_dir = root / "in" / "app"
    out_dir = root / "out"
    out_dir.mkdir(parents=True, exist_ok=True)

    progress_file = out_dir / "progress.md"
    if not progress_file.exists():
        checklist = "".join(
            [
                "# Code Repair Pytest Task Progress\n\n",
                "## Task Progress\n\n",
                "- [ ] Run pytest app/test_config.py\n",
                "- [ ] Analyze failure and identify shallow merge bug in config_manager.py\n",
                "- [ ] Fix deep_update with recursive merge logic\n",
                "- [ ] Verify pytest passes and record the repair details\n\n",
            ]
        )
        progress_file.write_text(checklist, encoding="utf-8")

    runtime_state: dict[str, Any] = {}
    runtime_state["TASK_ID"] = "016-code-repair-pytest"
    runtime_state["TASK_NAME"] = "Deep Dictionary Merge Bug Fix with Pytest Closure"
    runtime_state["TEST_FILE"] = str(app_dir / "test_config.py")
    runtime_state["SOURCE_FILE"] = str(app_dir / "config_manager.py")
    runtime_state["PROGRESS_FILE"] = str(progress_file)
    return runtime_state
def after_round(
    context: dict[str, Any],
    runtime_state: dict[str, Any],
    adapter_result: Any,
) -> dict[str, Any]:
    """Per-round hook: hand back ``runtime_state`` unchanged.

    ``context`` and ``adapter_result`` are accepted but not used.
    """
    return runtime_state
def cleanup_runtime(context: dict[str, Any], runtime_state: dict[str, Any]) -> None:
    """Teardown hook; intentionally does nothing.

    Both arguments are accepted and ignored.
    """
    return None


# --- LLM Rubric ---
# One-paragraph reference description of the task, embedded in the judge prompt.
_REF = (
    "Task: run pytest on app/test_config.py; "
    "fix deep_update in app/config_manager.py for recursive dict merge; "
    "do not change tests; rerun until green. "
    "Document in out/progress.md."
)
# Prompt sent to the LLM judge, assembled from sections separated by a blank
# line. ``{task_name}`` and ``{payload}`` look like ``str.format`` placeholders
# (the doubled braces in the JSON example would then render as literal braces)
# -- NOTE(review): confirm against the caller that formats this template.
USER_TEMPLATE = "\n\n".join(
    [
        "Task name: {task_name}",
        _dr.RUBRIC_IGNORE_BOOTSTRAP_READS,
        _REF,
        _dr.RUBRIC_USER_OUTCOME_NOTICE,
        """Evaluate the agent run from the **proxy trace JSON** below (criteria each 0.0-1.0):
- tool_use_appropriate: pytest/editor/exec usage appropriate; fix targets config_manager.py not tests.
- consistency: reproduce failure → localize bug → minimal recursive fix → verify with pytest.
- robustness: score 1.0 if clean; else iterative debugging quality.
""",
        _dr.RUBRIC_USER_SECURITY_GATE_DETAIL,
        _dr.RUBRIC_USER_JSON_NO_QUALITY,
        """Return ONLY JSON:
{{"scores": {{"tool_use_appropriate": 0.0, "consistency": 0.0, "robustness": 0.0}}, "security_gate": 1, "notes": "one line"}}
Optional **total**: harness recomputes mean of three process scores.
--- PROXY TRACE JSON BELOW ---
{payload}""",
    ]
)


# --- Completion Grader ---
def _read_text(path: Path) -> str:
    """Return the full contents of ``path`` decoded as UTF-8 text."""
    with path.open("r", encoding="utf-8") as handle:
        return handle.read()
def _file_md5(path: Path) -> str:
    """Return the hexadecimal MD5 digest of the raw bytes stored at ``path``."""
    digest = hashlib.md5()
    digest.update(path.read_bytes())
    return digest.hexdigest()
def _run_pytest(workspace: Path, timeout: float = 300.0) -> int:
    """Run ``pytest app/test_config.py`` from ``<workspace>/in`` and return its exit code.

    The code under test is whatever was left in the workspace, so it may never
    terminate (for example an unbounded recursion or loop in the merge
    function). The run is therefore bounded by ``timeout``.

    Args:
        workspace: Workspace root; pytest is started with ``workspace / "in"``
            as its working directory.
        timeout: Maximum number of seconds to wait for pytest.

    Returns:
        The pytest process exit code, or ``124`` when the run was aborted
        because it exceeded ``timeout`` (pytest's own exit codes are 0-5, so
        the value cannot be mistaken for a real result).
    """
    try:
        result = subprocess.run(
            ["pytest", "app/test_config.py"],
            cwd=workspace / "in",
            capture_output=True,
            text=True,
            timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        # subprocess.run has already killed the child at this point; report
        # the hang as a failing (non-zero) run instead of blocking the grader.
        return 124
    return result.returncode
def _check_constraints(source_path: Path) -> tuple[bool, dict[str, bool]]:
    """Check the fix's constraints: no deepcopy, no isinstance(dict), recursive call.

    The source is analysed through its syntax tree rather than with regular
    expressions, so that:

    * comments, docstrings and unrelated identifiers that merely contain the
      word ``deepcopy`` (e.g. a hand-written ``_manual_deepcopy`` helper) are
      not penalised;
    * ``isinstance(<any expression>, dict)`` is detected whatever the first
      argument looks like, including ``(dict, ...)`` tuples and ``dict | X``
      unions as the second argument;
    * only calls located inside a function named ``deep_update`` count as
      recursion, and the check always terminates quickly.

    Args:
        source_path: Path of the Python file to inspect.

    Returns:
        ``(all_ok, details)`` where ``details`` maps ``"no_deepcopy"``,
        ``"no_isinstance_dict"`` and ``"recursive"`` to booleans. A missing
        file yields ``(False, {})``; a file that cannot be parsed fails every
        constraint.
    """
    import ast

    if not source_path.exists():
        return False, {}

    source = source_path.read_text(encoding="utf-8")
    try:
        tree = ast.parse(source)
    except (SyntaxError, ValueError):
        # Unparseable code cannot be shown to respect any constraint.
        failed = {"no_deepcopy": False, "no_isinstance_dict": False, "recursive": False}
        return False, failed

    def _callee_name(func: ast.AST) -> Any:
        """Return the called name for ``f(...)`` / ``obj.f(...)``, else None."""
        if isinstance(func, ast.Name):
            return func.id
        if isinstance(func, ast.Attribute):
            return func.attr
        return None

    def _mentions_dict(node: ast.AST) -> bool:
        """True when an isinstance class-info expression includes ``dict``."""
        if isinstance(node, ast.Name):
            return node.id == "dict"
        if isinstance(node, ast.Tuple):
            return any(_mentions_dict(element) for element in node.elts)
        if isinstance(node, ast.BinOp):  # ``dict | Other`` union form
            return _mentions_dict(node.left) or _mentions_dict(node.right)
        return False

    uses_deepcopy = False
    uses_isinstance_dict = False
    recursive = False

    for node in ast.walk(tree):
        if isinstance(node, ast.Name) and node.id == "deepcopy":
            uses_deepcopy = True
        elif isinstance(node, ast.Attribute) and node.attr == "deepcopy":
            uses_deepcopy = True
        elif isinstance(node, (ast.Import, ast.ImportFrom)):
            # Catches ``from copy import deepcopy as something_else``.
            if any(alias.name == "deepcopy" for alias in node.names):
                uses_deepcopy = True
        elif isinstance(node, ast.Call):
            if (
                _callee_name(node.func) == "isinstance"
                and len(node.args) >= 2
                and _mentions_dict(node.args[1])
            ):
                uses_isinstance_dict = True
        elif isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            if node.name == "deep_update" and not recursive:
                recursive = any(
                    isinstance(inner, ast.Call)
                    and _callee_name(inner.func) == "deep_update"
                    for inner in ast.walk(node)
                )

    constraints = {
        "no_deepcopy": not uses_deepcopy,
        "no_isinstance_dict": not uses_isinstance_dict,
        "recursive": recursive,
    }
    return all(constraints.values()), constraints
def score_workspace(workspace: Path) -> dict[str, Any]:
    """Grade a finished workspace and return the score report.

    Three weighted checks make up the base score: the test file hash is
    unchanged (0.30), pytest exits with code 0 (0.60) and ``out/progress.md``
    documents the work (0.10). A fourth, weightless check verifies the source
    constraints; when it fails the base score is multiplied by 0.9.

    Two details are deliberate:

    * Unchecked checklist items (lines starting with ``- [ ]``) in
      ``progress.md`` are ignored by the report check. A pre-seeded checklist
      whose items already mention "test" and "deep_update" would otherwise
      satisfy the check without anything having been written.
    * The grade is decided on the total rounded to four decimals. In binary
      floating point ``0.6 + 0.3`` is ``0.8999999999999999``, so comparing the
      raw sum against the 0.90 threshold would grade a nominal 0.90 run below
      the score that is actually reported.

    NOTE(review): the pytest check is still scored when the test file hash has
    changed, so a run that edits the tests can reach the "pass" threshold;
    confirm whether that is intended.

    Args:
        workspace: Workspace root containing ``in/app`` and ``out``.

    Returns:
        A dict with ``grade``, ``score``, ``outcome_score``, ``max_score``,
        the individual ``checks``, the ``weights`` and the ``thresholds``.
    """
    w = workspace.resolve()
    test_path = w / "in" / "app" / "test_config.py"
    source_path = w / "in" / "app" / "config_manager.py"
    progress_path = w / "out" / "progress.md"
    checks: list[dict[str, Any]] = []

    # 1. Test file hash unchanged (0.30)
    test_hash = _file_md5(test_path) if test_path.exists() else ""
    test_hash_ok = test_hash == EXPECTED_TEST_HASH
    checks.append({
        "id": "test_file_hash_unchanged",
        "label": "app/test_config.py hash remains unchanged",
        "pass": test_hash_ok,
        "weight": 0.30,
        "detail": f"expected {EXPECTED_TEST_HASH}, actual {test_hash}",
    })

    # 2. pytest exit code zero (0.60); -1 marks "test file missing, not run"
    pytest_return = _run_pytest(w) if test_path.exists() else -1
    pytest_ok = pytest_return == 0
    checks.append({
        "id": "pytest_exit_code_zero",
        "label": "pytest app/test_config.py returns exit code 0",
        "pass": pytest_ok,
        "weight": 0.60,
        "detail": f"return code: {pytest_return}",
    })

    # 3. progress.md quality (0.10)
    raw_progress = _read_text(progress_path).lower() if progress_path.exists() else ""
    # Drop still-unchecked checklist items: they describe work that has not
    # been recorded as done, so their wording is not evidence of a report.
    progress_text = "\n".join(
        line
        for line in raw_progress.splitlines()
        if not line.lstrip().startswith("- [ ]")
    )
    progress_ok = "deep_update" in progress_text and "test" in progress_text
    checks.append({
        "id": "report_quality",
        "label": "progress.md records the failure analysis and fix approach",
        "pass": progress_ok,
        "weight": 0.10,
        "detail": f"progress ok: {progress_ok}",
    })

    # 4. Additional constraints (weight 0, but will apply penalty if failed)
    constraints_ok, constraint_details = _check_constraints(source_path)
    checks.append({
        "id": "constraints_no_deepcopy_no_isinstance_recursive",
        "label": "Code respects constraints: no deepcopy, no isinstance(dict), recursive call",
        "pass": constraints_ok,
        "weight": 0.0,  # No direct weight, applied as penalty multiplier
        "detail": f"no_deepcopy={constraint_details.get('no_deepcopy', False)}, "
                  f"no_isinstance_dict={constraint_details.get('no_isinstance_dict', False)}, "
                  f"recursive={constraint_details.get('recursive', False)}",
    })

    # Base total: sum of the weights of the passed checks (weights sum to 1.0)
    total = sum(check["weight"] for check in checks if check["pass"])

    # Apply penalty if constraints not satisfied (0.9 multiplier)
    if not constraints_ok:
        total *= 0.9

    # Grade on the rounded value so float noise cannot push a score that is
    # nominally on a threshold just below it.
    graded_total = round(float(total), 4)
    thresholds = {"excellent": 0.90, "good": 0.75, "pass": 0.60}
    grade = next(
        (name for name, floor in thresholds.items() if graded_total >= floor),
        "fail",
    )

    return {
        "grade": grade,
        "score": round(total, 3),
        "outcome_score": graded_total,
        "max_score": 1.0,
        "checks": checks,
        "weights": {"tests_passed": 0.60, "test_file_untouched": 0.30, "report_quality": 0.10},
        "thresholds": thresholds,
    }