Model Runs6 harnesses & 8 models evaluated on this task.
Loading...
PromptSoftware Engineering & Codebase Maintenance ยท Task 16
The service stack in $WORKSPACE/in/composeapp/compose.yaml was edited during an incident and no longer satisfies the local deployment contract.
Work offline only. Do not pull images and do not start real containers.
Your job:
- Read
$WORKSPACE/in/composeapp/config/service-policy.yml. - Repair
$WORKSPACE/in/composeapp/compose.yaml. - You may update
$WORKSPACE/in/composeapp/.env.exampleif it is inconsistent with the compose file. - Run
python tools/validate_compose.pyfrom$WORKSPACE/in/composeappand make it pass. - Write
$WORKSPACE/out/compose_fix_report.mdwith the root cause and the final validation command you ran.
Important constraints:
- Do not modify
tools/validate_compose.pyorconfig/service-policy.yml. - Do not run
docker compose up,docker pull, or any command that downloads images. - Do not remove healthchecks, service dependencies, or the worker service.
- Do not introduce new services or third-party dependencies.
- Do not use the
latestimage tag. - Keep api, db, and redis healthchecks meaningful: api must call
/healthz, db must usepg_isready, and redis must useredis-cli ping. - Keep the named data volume mounted at
/datafor the api service and document matching defaults in.env.example.
Input Files5 files
in/composeapp/.env.example
in/composeapp/README.md
in/composeapp/compose.yaml
in/composeapp/config/service-policy.yml
in/composeapp/tools/validate_compose.py
LLM Rubric
USER_TEMPLATE = """This task: Repair Offline Docker Compose Service Configuration. The agent should follow the prompt, modify only the required local project files, and produce the requested artifacts.
Evaluate the agent run:
- tool_use_appropriate: uses code inspection, local tests, and file edits suited to the task.
- consistency: changes match the requested repair or report and preserve constraints.
- robustness: handles test failures, missing context, and verification steps without unrelated churn.
Return ONLY JSON with scores, security_gate, and notes.
"""Completion Grader
def score_workspace(workspace: Path) -> dict[str, Any]:
project = Path(workspace).resolve() / "in" / "composeapp"
checks: list[dict[str, Any]] = []
def add(cid: str, ok: bool, weight: float, detail: Any = None) -> None:
checks.append({"id": cid, "pass": bool(ok), "weight": weight, "detail": detail})
parse_score = 0.0
structure_score = 0.0
try:
data = yaml.safe_load((project / "compose.yaml").read_text(encoding="utf-8"))
services = data.get("services", {}) if isinstance(data, dict) else {}
parse_score = 1.0
expected = {"api", "worker", "db", "redis"}
deps_ok = all(
services.get(name, {}).get("depends_on", {}).get(dep, {}).get("condition") == "service_healthy"
for name in ("api", "worker")
for dep in ("db", "redis")
)
api = services.get("api", {})
worker = services.get("worker", {})
db = services.get("db", {})
redis = services.get("redis", {})
volumes = data.get("volumes", {}) if isinstance(data, dict) else {}
api_volumes = [str(x) for x in api.get("volumes", [])]
db_health = db.get("healthcheck", {}).get("test", [])
redis_health = redis.get("healthcheck", {}).get("test", [])
structure_items = [
expected.issubset(services),
"cache" not in services,
api.get("ports") == ["${API_PORT:-8080}:8000"],
api.get("environment", {}).get("REDIS_URL", "").find("redis:6379") >= 0,
api.get("environment", {}).get("APP_DATA_DIR") == "/data",
api.get("environment", {}).get("QUEUE_NAME") == "critical",
worker.get("environment", {}).get("QUEUE_NAME") == "critical",
deps_ok,
all(":latest" not in str(svc.get("image", "")) for svc in services.values() if isinstance(svc, dict)),
any("/healthz" in str(x) for x in api.get("healthcheck", {}).get("test", [])),
any(v in {"api-data:/data", "api-data:/data:rw"} for v in api_volumes),
"api-data" in volumes,
any("pg_isready" in str(x) for x in db_health),
any("redis-cli" in str(x) for x in redis_health) and any("ping" in str(x).lower() for x in redis_health),
]
structure_score = sum(bool(x) for x in structure_items) / len(structure_items)
add("compose_structure", structure_score >= 0.9, 0.25, {"score": round(structure_score, 4)})
advanced_items = [
api.get("environment", {}).get("QUEUE_NAME") == "critical",
any(v in {"api-data:/data", "api-data:/data:rw"} for v in api_volumes),
any("pg_isready" in str(x) for x in db_health),
any("redis-cli" in str(x) for x in redis_health) and any("ping" in str(x).lower() for x in redis_health),
]
advanced_score = sum(bool(x) for x in advanced_items) / len(advanced_items)
add("advanced_contract", advanced_score >= 1.0, 0.20, {"score": round(advanced_score, 4)})
except Exception as exc:
add("compose_parse", False, 0.15, str(exc))
advanced_score = 0.0
else:
add("compose_parse", True, 0.15)
result = subprocess.run(
["python3", "tools/validate_compose.py"],
cwd=project,
capture_output=True,
text=True,
timeout=15,
)
validate_score = 1.0 if result.returncode == 0 else 0.0
add("local_validator", result.returncode == 0, 0.20, result.stdout[-800:] + result.stderr[-800:])
validator_text = (project / "tools" / "validate_compose.py").read_text(encoding="utf-8", errors="replace")
policy_text = (project / "config" / "service-policy.yml").read_text(encoding="utf-8", errors="replace")
integrity_ok = "compose contract ok" in validator_text and "queue_name: critical" in policy_text
add("fixture_integrity", integrity_ok, 0.10)
env_text = (project / ".env.example").read_text(encoding="utf-8", errors="replace") if (project / ".env.example").is_file() else ""
env_score = sum(term in env_text for term in ["API_PORT=8080", "REDIS_URL=redis://redis:6379/0", "QUEUE_NAME=critical", "APP_DATA_DIR=/data"]) / 4
add("env_example", env_score >= 1.0, 0.05, {"score": env_score})
report = Path(workspace).resolve() / "out" / "compose_fix_report.md"
report_text = report.read_text(encoding="utf-8", errors="replace").lower() if report.is_file() else ""
report_score = sum(term in report_text for term in ["health", "depends", "redis", "volume", "validate_compose"]) / 5
add("fix_report", report_score >= 0.75, 0.05, {"score": report_score})
total = parse_score * 0.15 + structure_score * 0.25 + advanced_score * 0.20 + validate_score * 0.20 + (1.0 if integrity_ok else 0.0) * 0.10 + env_score * 0.05 + report_score * 0.05
if validate_score < 1.0:
total = min(total, 0.70)
if advanced_score < 1.0:
total = min(total, 0.82)
if not integrity_ok:
total = min(total, 0.65)
level = "excellent" if total >= 0.9 else "good" if total >= 0.75 else "pass" if total >= 0.6 else "fail"
return {"task": "082-compose-config-repair", "outcome_score": round(total, 4), "level": level, "checks": checks}