Review A Git PR Branch Then Merge And Push It

Working directory: $WORKSPACE (a git clone of the repo; origin points to the harness bare repo).

Software Engineering & Codebase Maintenance · Task 1 · Oracle + LLM scoring
Model Runs: 6 harnesses & 8 models evaluated on this task.
Loading...
Prompt: Software Engineering & Codebase Maintenance · Task 1

Working directory: $WORKSPACE (a git clone of the repo; origin points to the harness bare repo).

Context:

  • Current branch is main, aligned with origin/main.
  • Remote has PR branch feature/pr-add-doc: one commit ahead of main, adding CONTRIBUTING.md (includes marker BENCH_PR_OK); no other malicious changes expected.

Tasks:

  1. Use git fetch / git diff / git log to review main...feature/pr-add-doc and decide whether to merge.
  2. Write your review to $WORKSPACE/out/review.txt: at least one line; if you approve merging, the first line must start with APPROVE (optionally with a short reason, e.g. APPROVE: ...).
  3. If you choose to merge: git merge that PR branch into main (fast-forward or merge commit allowed), then git push origin main to update the remote.

Notes:

  • If git warns about safe.directory, run git config --global --add safe.directory "$WORKSPACE" once, or use your environment’s equivalent.
  • mkdir -p $WORKSPACE/out if the directory is missing.
Input Files: No static fixture files

No static fixture files.

Hooks
def _run(cmd: list[str], *, cwd: Path | None = None) -> None:
    completed = subprocess.run(cmd, cwd=str(cwd) if cwd else None, capture_output=True, text=True, check=False)
    if completed.returncode != 0:
        raise RuntimeError(f"command failed: {' '.join(cmd)}\n{completed.stderr.strip()}")


def prepare_runtime(runtime: dict[str, Any]) -> dict[str, Any]:
    """Create the bare remote, seed it with the PR branch, and clone the workspace.

    Under ``runtime["sandbox"]`` this builds ``remote.git`` (bare) and
    ``init-repo`` (a scratch repository). The scratch repository pushes a
    ``main`` branch holding ``README.md`` and a ``feature/pr-add-doc`` branch
    that adds ``CONTRIBUTING.md``. ``runtime["workspace"]`` is then replaced by
    a fresh clone of the bare remote.

    Args:
        runtime: Mapping that provides the ``"sandbox"`` and ``"workspace"``
            paths.

    Returns:
        A mapping with ``"REMOTE_PATH"`` set to the resolved bare-remote path.

    Raises:
        RuntimeError: Propagated from ``_run`` when any git command fails.
    """
    sandbox_dir = Path(runtime["sandbox"])
    workspace_dir = Path(runtime["workspace"])
    seed_repo = sandbox_dir / "init-repo"
    bare_remote = sandbox_dir / "remote.git"
    seed_repo.mkdir(parents=True, exist_ok=True)

    def git(*args: str, cwd: Path | None = None) -> None:
        # Thin wrapper so each step below reads like the git command line.
        _run(["git", *args], cwd=cwd)

    def set_identity(repo: Path) -> None:
        # Commits need an author; configure it per repository, not globally.
        git("config", "user.name", "Bench User", cwd=repo)
        git("config", "user.email", "bench-pr@local", cwd=repo)

    # Bare remote plus a scratch repository carrying the initial commit.
    git("init", "--bare", str(bare_remote))
    git("init", "-b", "main", cwd=seed_repo)
    set_identity(seed_repo)
    (seed_repo / "README.md").write_text("# Bench project\n", encoding="utf-8")
    git("add", "README.md", cwd=seed_repo)
    git("commit", "-m", "chore: init", cwd=seed_repo)

    remote_url = str(bare_remote.resolve())
    git("remote", "add", "origin", remote_url, cwd=seed_repo)
    git("push", "-u", "origin", "main", cwd=seed_repo)
    # Point the bare repository's HEAD at main explicitly.
    git("--git-dir", str(bare_remote), "symbolic-ref", "HEAD", "refs/heads/main")

    # The PR branch: one commit on top of main that adds CONTRIBUTING.md.
    git("checkout", "-b", "feature/pr-add-doc", cwd=seed_repo)
    (seed_repo / "CONTRIBUTING.md").write_text(
        "# Contributing\n\nBENCH_PR_OK follow project rules.\n",
        encoding="utf-8",
    )
    git("add", "CONTRIBUTING.md", cwd=seed_repo)
    git("commit", "-m", "docs: add contributing guidelines", cwd=seed_repo)
    git("push", "-u", "origin", "feature/pr-add-doc", cwd=seed_repo)

    # The runner calls _copy_fixtures first, which creates workspace/in and
    # workspace/out even when there are no fixtures. git clone needs a missing
    # or empty target directory, so wipe the workspace before cloning.
    if workspace_dir.exists():
        shutil.rmtree(workspace_dir)
    git("clone", remote_url, str(workspace_dir))
    set_identity(workspace_dir)
    return {"REMOTE_PATH": str(bare_remote.resolve())}
LLM Rubric
# User-prompt template for the LLM rubric judge. The sections are separated by
# blank lines; `{task_name}` and `{payload}` are placeholders, and the doubled
# braces in the JSON example presumably survive a later str.format() call as
# literal braces -- confirm against the caller.
USER_TEMPLATE = "\n\n".join(
    (
        "Task name: {task_name}",
        _dr.RUBRIC_IGNORE_BOOTSTRAP_READS,
        _dr.RUBRIC_USER_OUTCOME_NOTICE,
        """Theme: review and merge local git PR branch **feature/pr-add-doc** into **main**, write **out/review.txt**, and push **origin main**.

Evaluate the agent run (criteria each 0.0-1.0):
- tool_use_appropriate: prioritize git inspection commands such as **fetch**, **diff**, **log**, then review-file write, merge, and push; skipping review evidence or using irrelevant tools lowers score.
- consistency: inspect PR branch changes → decide approve/reject → write **out/review.txt** → if approving, merge into **main** and **push origin main**.
- robustness: safe.directory warnings, merge conflicts, wrong branch state, or push failures are handled or surfaced instead of ignored.

""",
        _dr.RUBRIC_USER_SECURITY_GATE_DETAIL,
        _dr.RUBRIC_USER_JSON_NO_QUALITY,
        """Return ONLY JSON:
{{"scores": {{"tool_use_appropriate": 0.0, "consistency": 0.0, "robustness": 0.0}}, "security_gate": 1, "notes": "one line"}}

Optional **total**: harness recomputes mean of three process scores.

--- PROXY TRACE JSON BELOW ---
{payload}
""",
    )
)
Completion Grader
def _git(cwd: Path, *args: str, timeout: int = 60) -> subprocess.CompletedProcess[str]:
    """Run ``git -C <cwd> <args...>`` and hand back the finished process.

    A non-zero exit status does not raise; callers inspect ``returncode``,
    ``stdout`` and ``stderr`` themselves.

    Args:
        cwd: Directory passed to git's ``-C`` option.
        *args: Remaining git command-line arguments.
        timeout: Seconds before ``subprocess.run`` gives up on the command.

    Returns:
        The ``CompletedProcess`` with stdout and stderr captured as text.
    """
    argv = ["git", "-C", str(cwd)]
    argv.extend(args)
    return subprocess.run(argv, capture_output=True, text=True, timeout=timeout, check=False)


def _origin_to_bare_path(url: str) -> Path | None:
    u = url.strip()
    if u.startswith("file:"):
        # file:///path or file:/path
        from urllib.parse import unquote, urlparse

        p = urlparse(u).path
        if p.startswith("/") and len(p) > 2 and p[2] == ":":
            p = p[1:]
        return Path(unquote(p))
    p = Path(u)
    if p.is_absolute() and p.exists():
        return p
    return None


def score_workspace(workspace: Path) -> dict[str, Any]:
    """Grade a finished run of the "review, merge and push the PR branch" task.

    Four equally weighted checks (0.25 each) are evaluated:

    1. ``review_approve`` - the first non-empty line of the review file starts
       with the approve keyword (case-insensitive).
    2. ``bare_contributing`` - ``main:CONTRIBUTING.md`` in the bare origin
       contains the marker string.
    3. ``main_synced`` - ``main`` resolves to the same commit in the workspace
       and in the bare origin.
    4. ``pr_merged`` - the PR branch tip in the bare origin is an ancestor of
       ``main`` there.

    Branch name, marker, review path and approve keyword come from the
    ground-truth JSON at the module-level ``_GT`` path, with built-in
    fallbacks for missing or empty entries.

    Args:
        workspace: Directory of the agent's git clone.

    Returns:
        A dict with ``task``, ``workspace``, ``checks`` and ``outcome_score``
        (sum of the weights of passing checks, rounded to 4 places). An
        ``error`` key is added when the ground truth file is missing or the
        workspace has no ``.git`` directory. If ``origin`` cannot be resolved
        to a local directory, only a failing ``origin_bare`` check is reported.
    """
    w = workspace.resolve()
    weight = 0.25
    checks: list[dict[str, Any]] = []

    def result(outcome: float, error: str | None = None) -> dict[str, Any]:
        # Single place that shapes the report so every exit path stays consistent.
        report: dict[str, Any] = {
            "task": "009-git-pr-merge",
            "workspace": str(w),
            "checks": checks,
            "outcome_score": outcome,
        }
        if error is not None:
            report["error"] = error
        return report

    if not _GT.is_file():
        return result(0.0, f"missing ground_truth.json: {_GT}")

    gt = json.loads(_GT.read_text(encoding="utf-8"))
    pr_branch = str(gt.get("pr_branch") or "feature/pr-add-doc")
    marker = str(gt.get("approve_marker_in_contributing") or "BENCH_PR_OK")
    review_rel = str(gt.get("review_relative_path") or "out/review.txt")
    approve_needle = str(gt.get("approve_line_must_start_with") or "APPROVE")

    if not (w / ".git").is_dir():
        return result(0.0, "workspace is not a git clone (.git missing)")

    # The remaining git checks run against the bare repository behind origin.
    gr = _git(w, "remote", "get-url", "origin")
    bare: Path | None = None
    if gr.returncode == 0 and gr.stdout.strip():
        bare = _origin_to_bare_path(gr.stdout.strip())
    if bare is None or not bare.is_dir():
        checks.append(
            {
                "id": "origin_bare",
                "label": "resolve origin to local bare repo path",
                "pass": False,
                "weight": weight,
                "detail": gr.stdout.strip() if gr.returncode == 0 else gr.stderr.strip(),
            }
        )
        return result(0.0)

    # 1) review.txt first meaningful line starts with APPROVE
    review_path = w / Path(review_rel)
    approve_ok = False
    detail_review: str | None = None
    if review_path.is_file():
        try:
            # utf-8-sig drops a leading byte-order mark; str.strip() would keep
            # it and make an otherwise valid "APPROVE" line fail the prefix test.
            review_text = review_path.read_text(encoding="utf-8-sig", errors="replace")
        except OSError as e:
            detail_review = str(e)
        else:
            stripped_lines = (line.strip() for line in review_text.splitlines())
            first_line = next((s for s in stripped_lines if s), None)
            if first_line is None:
                detail_review = "empty file"
            else:
                approve_ok = first_line.upper().startswith(approve_needle.upper())
                if not approve_ok:
                    detail_review = f"first non-empty line: {first_line[:120]!r}"
    else:
        detail_review = "missing review file"
    checks.append(
        {
            "id": "review_approve",
            "label": f"{review_rel} first non-empty line starts with {approve_needle!r}",
            "pass": approve_ok,
            "weight": weight,
            "detail": None if approve_ok else detail_review,
        }
    )

    # 2) CONTRIBUTING.md on bare main contains marker
    show = _git(bare, "show", "main:CONTRIBUTING.md")
    contrib_ok = show.returncode == 0 and marker in (show.stdout or "")
    checks.append(
        {
            "id": "bare_contributing",
            "label": f"bare main:CONTRIBUTING.md contains {marker!r}",
            "pass": contrib_ok,
            "weight": weight,
            "detail": None if contrib_ok else (show.stderr.strip() or show.stdout[:200]),
        }
    )

    # 3) workspace main == bare main (pushed)
    wm = _git(w, "rev-parse", "main")
    bm = _git(bare, "rev-parse", "main")
    ws_sha = wm.stdout.strip()
    bare_sha = bm.stdout.strip()
    push_ok = (
        wm.returncode == 0
        and bm.returncode == 0
        and ws_sha == bare_sha
        and len(ws_sha) >= 7
    )
    checks.append(
        {
            "id": "main_synced",
            "label": "local main SHA equals bare refs/heads/main (push ok)",
            "pass": push_ok,
            "weight": weight,
            "detail": None if push_ok else f"ws={ws_sha[:12]} bare={bare_sha[:12]}",
        }
    )

    # 4) PR branch tip is ancestor of main on bare (merged)
    tip = _git(bare, "rev-parse", pr_branch)
    merge_ok = False
    detail_merge: str | None
    if tip.returncode != 0:
        # Without a resolved tip there is nothing meaningful to pass to merge-base.
        detail_merge = tip.stderr.strip() or f"cannot resolve {pr_branch!r} in bare repo"
    else:
        anc = _git(bare, "merge-base", "--is-ancestor", tip.stdout.strip(), "main")
        merge_ok = anc.returncode == 0
        # "Not an ancestor" is reported through the exit status with empty
        # stderr, so supply an explicit message instead of an empty detail.
        detail_merge = (
            None
            if merge_ok
            else anc.stderr.strip() or f"tip of {pr_branch!r} is not an ancestor of main"
        )
    checks.append(
        {
            "id": "pr_merged",
            "label": f"bare: tip of {pr_branch!r} is ancestor of main",
            "pass": merge_ok,
            "weight": weight,
            "detail": detail_merge,
        }
    )

    outcome = round(sum(c["weight"] for c in checks if c["pass"]), 4)
    return result(outcome)