Repair GitHub Actions CI Config

The fixture project has a broken GitHub Actions workflow at $WORKSPACE/in/project/.github/workflows/ci.yml.

Software Engineering & Codebase Maintenance · Task 11 · Oracle + LLM scoring
Model Runs: 6 harnesses & 8 models evaluated on this task.
Loading...
Prompt: Software Engineering & Codebase Maintenance · Task 11

The fixture project has a broken GitHub Actions workflow at $WORKSPACE/in/project/.github/workflows/ci.yml.

Read $WORKSPACE/in/project/CI_NOTES.md before editing. It contains required event safety, action version, caching, and path-filter constraints.

Your job:

  • Edit the workflow YAML so it is syntactically valid and runs the local Python tests.
  • Do not edit application code or tests.
  • Do not edit CI_NOTES.md.
  • Keep the workflow offline-friendly: no deployment, no secrets, no external service calls.
  • Include checkout, Python setup, dependency install from requirements.txt, and python -m pytest.
  • Use pinned major versions actions/checkout@v4 and actions/setup-python@v5.
  • Include a small Python version matrix for 3.10 and 3.11.
  • Include dependency caching for pip.
  • Add path filters so the workflow is relevant to app code, tests, requirements, and workflow changes.
  • Path filters must also include CI_NOTES.md.
  • Run both python -m pytest and a lightweight smoke command that imports normalize_percent from app.mathutil and calls it once.
  • Use push, pull_request, and workflow_dispatch, but do not use pull_request_target.

Also write $WORKSPACE/in/project/ci_design_notes.md explaining:

  • why the workflow is safe for pull requests,
  • why each path filter is included,
  • what the smoke command imports and calls,
  • why deployment/secrets/external service calls are intentionally absent.

The oracle will parse the YAML and run the configured local test command.

Input Files: 5 files
in/project/.github/workflows/ci.yml
in/project/CI_NOTES.md
in/project/app/mathutil.py
in/project/requirements.txt
in/project/test_mathutil.py
LLM Rubric
_REF = """
Oracle validates workflow shape. Evaluate tool choice, iterative CI reasoning, resilience to malformed logs โ€” standard three axes.
""".strip()

# Prompt template for the LLM judge. The sections are separated by blank
# lines; ``{task_name}`` and ``{payload}`` are placeholders and the literal
# braces of the JSON example are doubled (presumably for str.format --
# confirm against the caller).
USER_TEMPLATE = "\n\n".join(
    [
        "Task name: {task_name}",
        _dr.RUBRIC_IGNORE_BOOTSTRAP_READS,
        _REF,
        "",  # empty section: preserves the doubled gap after the reference text
        _dr.RUBRIC_USER_SECURITY_GATE_DETAIL,
        _dr.RUBRIC_USER_JSON_NO_QUALITY,
        "Return ONLY JSON:\n"
        '{{"scores": {{"tool_use_appropriate": 0.0, "consistency": 0.0, "robustness": 0.0}}, '
        '"security_gate": 1, "notes": "one line"}}\n\n'
        "--- PROXY TRACE JSON BELOW ---\n"
        "{payload}",
    ]
)
Completion Grader
def _md5(path: Path) -> str:
    return hashlib.md5(path.read_bytes()).hexdigest()


def _walk_strings(value: Any) -> list[str]:
    if isinstance(value, dict):
        return [str(key) for key in value.keys()] + [s for item in value.values() for s in _walk_strings(item)]
    if isinstance(value, list):
        return [s for item in value for s in _walk_strings(item)]
    return [str(value)]


def _has_timeout(job: Any) -> bool:
    return isinstance(job, dict) and "timeout-minutes" in job


def _matches_github_path(path: str, pattern: str) -> bool:
    path = path.strip("/")
    pattern = pattern.strip("/")
    if fnmatch.fnmatch(path, pattern):
        return True
    if pattern.endswith("/**"):
        return path == pattern[:-3] or path.startswith(pattern[:-2])
    if "/**/" in pattern:
        prefix, suffix = pattern.split("/**/", 1)
        if path == f"{prefix}/{suffix}" or fnmatch.fnmatch(path, f"{prefix}/{suffix}"):
            return True
        if path.startswith(f"{prefix}/"):
            return fnmatch.fnmatch(path[len(prefix) + 1:], suffix)
    return False


def score_workspace(workspace: Path) -> dict[str, Any]:
    """Grade a repaired CI workflow workspace and return the score report.

    The project is looked up at ``<workspace>/in/project`` and, if that does
    not exist, at ``<workspace>/project``. Seven weighted components are
    combined: YAML parse (0.10), workflow structure (0.18), path-filter
    simulation (0.18), local pytest run (0.14), smoke commands (0.18), design
    notes (0.08) and protected-file integrity (0.14). The weighted total is
    then limited by the lowest cap triggered by a failed component.

    Failures of the external steps (pytest timing out or not starting, a
    protected file being missing or unreadable) are recorded as failed checks
    rather than aborting the grading run.

    Args:
        workspace: Root directory of the workspace under evaluation.

    Returns:
        A dict with ``task``, ``outcome_score`` (rounded to four places),
        ``level`` (``excellent``, ``good``, ``pass`` or ``fail``) and
        ``checks``, a list of ``{"id", "pass", "weight", "detail"}`` entries.

    Note:
        ``python -c`` / ``python3 -c`` commands found in the workflow are
        executed on the grading host. The workflow is produced by the run
        being graded, so this function should only be used inside a sandbox.
    """
    w = Path(workspace).resolve()
    project = w / "in" / "project"
    if not project.exists():
        project = w / "project"
    workflow = project / ".github" / "workflows" / "ci.yml"
    checks: list[dict[str, Any]] = []

    def add(cid: str, ok: bool, weight: float, detail: Any = None) -> None:
        checks.append({"id": cid, "pass": bool(ok), "weight": weight, "detail": detail})

    parse_score = 0.0
    structure_score = 0.0
    path_filter_score = 0.0
    smoke_score = 0.0
    try:
        raw_yaml = workflow.read_text(encoding="utf-8")
        data = yaml.safe_load(raw_yaml)
        parse_score = 1.0 if isinstance(data, dict) else 0.0
        add("yaml_parse", parse_score == 1.0, 0.10)
        if not isinstance(data, dict):
            # Nothing below can be evaluated without a top-level mapping.
            raise ValueError("workflow YAML top level is not a mapping")

        # Search both the raw file and every parsed key/value for terms.
        text = raw_yaml + "\n" + "\n".join(_walk_strings(data))
        term_hits = sum(term in text for term in _GT["required_yaml_terms"])
        forbidden = [term for term in _GT["forbidden_yaml_terms"] if term.lower() in text.lower()]
        has_jobs = isinstance(data.get("jobs"), dict) and "test" in data.get("jobs", {})
        matrix_text = json.dumps(data, ensure_ascii=False).lower()
        matrix_ok = "3.10" in matrix_text and "3.11" in matrix_text and "matrix" in matrix_text
        app_paths_ok = "app/" in matrix_text or "app/**" in matrix_text
        test_paths_ok = "test" in matrix_text
        paths_ok = (
            "paths" in matrix_text
            and app_paths_ok
            and test_paths_ok
            and "requirements.txt" in matrix_text
            and ".github/workflows" in matrix_text
            and "ci_notes.md" in matrix_text
        )
        events_ok = "pull_request" in matrix_text and "push" in matrix_text and "workflow_dispatch" in matrix_text
        permissions = data.get("permissions", {})
        permissions_ok = isinstance(permissions, dict) and str(permissions.get("contents", "")).lower() == "read"
        concurrency = data.get("concurrency", {})
        concurrency_ok = isinstance(concurrency, dict) and bool(concurrency.get("cancel-in-progress"))
        jobs = data.get("jobs", {})
        timeout_ok = any(_has_timeout(job) for job in jobs.values()) if isinstance(jobs, dict) else False
        exact_score = sum([
            permissions_ok,
            concurrency_ok,
            timeout_ok,
            any(term in text for term in _GT.get("workflow_exact_terms", [])),
        ]) / 4
        structure_score = (
            0.35 * min(term_hits / len(_GT["required_yaml_terms"]), 1)
            + 0.13 * has_jobs
            + 0.13 * matrix_ok
            + 0.10 * paths_ok
            + 0.10 * events_ok
            + 0.06 * (not forbidden)
            + 0.13 * exact_score
        )
        if not paths_ok:
            # Missing path filters keep the structure check below its 0.75 bar.
            structure_score = min(structure_score, 0.65)
        add(
            "ci_structure",
            structure_score >= 0.75,
            0.18,
            {
                "term_hits": term_hits,
                "forbidden": forbidden,
                "paths_ok": paths_ok,
                "app_paths_ok": app_paths_ok,
                "test_paths_ok": test_paths_ok,
                "permissions_ok": permissions_ok,
                "concurrency_ok": concurrency_ok,
                "timeout_ok": timeout_ok,
            },
        )

        # PyYAML parses a bare ``on`` key as the boolean True, so look up both.
        on_cfg = data.get("on", data.get(True, {}))
        path_sets: list[list[str]] = []
        if isinstance(on_cfg, dict):
            for event in ("push", "pull_request"):
                event_cfg = on_cfg.get(event, {})
                if isinstance(event_cfg, dict):
                    paths = event_cfg.get("paths", [])
                    if isinstance(paths, list):
                        path_sets.append([str(p) for p in paths])

        def matches_any(path: str, patterns: list[str]) -> bool:
            return any(_matches_github_path(path, pat) for pat in patterns)

        if path_sets:
            # A positive path must be covered by every event's filter; a
            # negative path must be covered by none of them.
            positive_hits = sum(
                all(matches_any(path, patterns) for patterns in path_sets)
                for path in _GT["path_filter_positive"]
            )
            negative_hits = sum(
                not any(matches_any(path, patterns) for patterns in path_sets)
                for path in _GT["path_filter_negative"]
            )
            path_filter_score = (
                0.75 * (positive_hits / len(_GT["path_filter_positive"]))
                + 0.25 * (negative_hits / len(_GT["path_filter_negative"]))
            )
        add("path_filter_simulation", path_filter_score >= 0.95, 0.18, {"score": round(path_filter_score, 4)})
    except Exception as exc:
        # Report the failure against whichever checks were not reached yet,
        # so a successful parse is never overwritten by a later error and no
        # check id is recorded twice.
        recorded = {check["id"] for check in checks}
        for cid, weight in (("yaml_parse", 0.10), ("ci_structure", 0.18), ("path_filter_simulation", 0.18)):
            if cid not in recorded:
                add(cid, False, weight, str(exc))

    env = os.environ.copy()
    env["PYTHONPATH"] = str(project)
    try:
        result = subprocess.run(
            ["python3", "-m", "pytest"],
            cwd=project,
            env=env,
            capture_output=True,
            text=True,
            timeout=20,
        )
    except (subprocess.TimeoutExpired, OSError) as exc:
        # A hanging suite, a missing interpreter or a missing project
        # directory is a failed check, not a grader crash.
        test_score = 0.0
        add("local_tests", False, 0.14, str(exc))
    else:
        test_score = 1.0 if result.returncode == 0 else 0.0
        add("local_tests", result.returncode == 0, 0.14, result.stdout[-1000:] + result.stderr[-1000:])

    smoke_cmds = []
    try:
        data = yaml.safe_load(workflow.read_text(encoding="utf-8"))
        for job in (data.get("jobs", {}) if isinstance(data, dict) else {}).values():
            for step in job.get("steps", []) if isinstance(job, dict) else []:
                run = step.get("run") if isinstance(step, dict) else None
                if isinstance(run, str) and ("python -c" in run or "python3 -c" in run):
                    smoke_cmds.append(run)
        smoke_results = []
        for cmd in smoke_cmds:
            argv = shlex.split(cmd)
            if len(argv) < 3 or argv[:2] not in (["python", "-c"], ["python3", "-c"]):
                smoke_results.append({"cmd": cmd, "returncode": 127, "stderr": "smoke command must be python -c or python3 -c"})
                continue
            if not all(term in cmd for term in _GT.get("smoke_command_terms", [])):
                smoke_results.append({"cmd": cmd, "returncode": 126, "stderr": "smoke command must import and call normalize_percent"})
                continue
            exec_argv = list(argv)
            if exec_argv[0] == "python":
                # The local tests above also run under "python3".
                exec_argv[0] = "python3"
            # NOTE(security): this executes code taken from the evaluated
            # workflow file; argv is passed as a list (no shell).
            proc = subprocess.run(exec_argv, cwd=project, env=env, capture_output=True, text=True, timeout=10)
            smoke_results.append({"cmd": cmd, "returncode": proc.returncode, "stderr": proc.stderr[-500:]})
        smoke_score = 1.0 if smoke_results and all(item["returncode"] == 0 for item in smoke_results) else 0.0
        add("smoke_commands", smoke_score == 1.0, 0.18, smoke_results or "missing python/python3 -c smoke command")
    except Exception as exc:
        add("smoke_commands", False, 0.18, str(exc))

    design_path = project / "ci_design_notes.md"
    design_text = design_path.read_text(encoding="utf-8", errors="replace").lower() if design_path.is_file() else ""
    design_terms = _GT.get("design_note_terms", [])
    design_score = sum(term.lower() in design_text for term in design_terms) / max(len(design_terms), 1)
    add("ci_design_notes", design_score >= 0.80, 0.08, {"score": round(design_score, 4)})

    def unchanged(rel: str, digest: str) -> bool:
        # A deleted or unreadable protected file counts as modified.
        try:
            return _md5(project / rel) == digest
        except OSError:
            return False

    intact = [unchanged(rel, digest) for rel, digest in _HASHES.items()]
    # With no protected files there is nothing that could have been altered.
    integrity = sum(intact) / len(intact) if intact else 1.0
    add("code_integrity", integrity == 1.0, 0.14, {"score": integrity})

    total = (
        parse_score * 0.10
        + structure_score * 0.18
        + path_filter_score * 0.18
        + test_score * 0.14
        + smoke_score * 0.18
        + design_score * 0.08
        + integrity * 0.14
    )
    # Each failed component limits the final score; the lowest cap applies.
    caps = []
    if parse_score < 1.0:
        caps.append(0.55)
    if path_filter_score < 0.95:
        caps.append(0.72)
    if smoke_score < 1.0:
        caps.append(0.72)
    if test_score < 1.0:
        caps.append(0.70)
    if integrity < 1.0:
        caps.append(0.65)
    if design_score < 0.60:
        caps.append(0.84)
    if caps:
        total = min(total, min(caps))
    thresholds = _GT["scoring"]["thresholds"]
    if total >= thresholds["excellent"]:
        level = "excellent"
    elif total >= thresholds["good"]:
        level = "good"
    elif total >= thresholds["pass"]:
        level = "pass"
    else:
        level = "fail"
    return {"task": "044-ci-config-repair", "outcome_score": round(total, 4), "level": level, "checks": checks}