Model Runs: 6 harnesses & 8 models evaluated on this task.
Loading...
Prompt: Software Engineering & Codebase Maintenance · Task 1
Working directory: $WORKSPACE (a git clone of the repo; origin points to the harness bare repo).
Context:
- Current branch is `main`, aligned with `origin/main`.
- Remote has PR branch `feature/pr-add-doc`: one commit ahead of main, adding `CONTRIBUTING.md` (includes marker `BENCH_PR_OK`); no other malicious changes expected.
Tasks:
- Use `git fetch` / `git diff` / `git log` to review main…feature/pr-add-doc and decide whether to merge.
- Write your review to `$WORKSPACE/out/review.txt`: at least one line; if you approve merging, the first line must start with `APPROVE` (optionally with a short reason, e.g. `APPROVE: ...`).
- If you choose to merge: `git merge` that PR branch into main (fast-forward or merge commit allowed), then `git push origin main` to update the remote.
Notes:
- If `git` warns about `safe.directory`, run once `git config --global --add safe.directory "$WORKSPACE"` or use your environment’s equivalent.
- `mkdir -p $WORKSPACE/out` if the directory is missing.
Input Files
No static fixture files.
Hooks
def _run(cmd: list[str], *, cwd: Path | None = None) -> None:
    """Execute *cmd* and raise if it exits with a non-zero status.

    Args:
        cmd: Argument vector handed to ``subprocess.run`` (no shell involved).
        cwd: Directory to run the command in; ``None`` keeps the current one.

    Raises:
        RuntimeError: The exit status was non-zero. The message contains the
            space-joined command followed by the stripped stderr text.
    """
    working_dir = None if not cwd else str(cwd)
    result = subprocess.run(
        cmd,
        cwd=working_dir,
        capture_output=True,
        text=True,
        check=False,
    )
    if result.returncode == 0:
        return
    command_line = " ".join(cmd)
    stderr_text = result.stderr.strip()
    raise RuntimeError(f"command failed: {command_line}\n{stderr_text}")
def prepare_runtime(runtime: dict[str, Any]) -> dict[str, Any]:
    """Build the bare remote, seed it with main and the PR branch, and clone it.

    Reads ``runtime["sandbox"]`` and ``runtime["workspace"]``. Under the
    sandbox it creates a bare repository ``remote.git`` and a scratch
    repository ``init-repo``; the scratch repository pushes an initial
    ``main`` commit (README.md) and a ``feature/pr-add-doc`` commit
    (CONTRIBUTING.md) to the bare remote. The workspace directory is then
    replaced by a fresh clone of that remote.

    Returns:
        A dict with ``REMOTE_PATH`` set to the resolved path of the bare repo.

    Raises:
        RuntimeError: Propagated from ``_run`` when any git command fails.
    """
    sandbox = Path(runtime["sandbox"])
    workspace = Path(runtime["workspace"])
    init_repo = sandbox / "init-repo"
    remote = sandbox / "remote.git"
    init_repo.mkdir(parents=True, exist_ok=True)

    identity = (("user.name", "Bench User"), ("user.email", "bench-pr@local"))

    def set_identity(repo: Path) -> None:
        # Commits need an author configured in each repository.
        for key, value in identity:
            _run(["git", "config", key, value], cwd=repo)

    def commit_file(filename: str, content: str, message: str) -> None:
        # Write one file into the scratch repo, stage it and commit it.
        (init_repo / filename).write_text(content, encoding="utf-8")
        _run(["git", "add", filename], cwd=init_repo)
        _run(["git", "commit", "-m", message], cwd=init_repo)

    # Bare remote plus a scratch repository whose default branch is main.
    _run(["git", "init", "--bare", str(remote)])
    _run(["git", "init", "-b", "main"], cwd=init_repo)
    set_identity(init_repo)

    # Baseline commit on main, published to the remote.
    commit_file("README.md", "# Bench project\n", "chore: init")
    remote_abs = str(remote.resolve())
    _run(["git", "remote", "add", "origin", remote_abs], cwd=init_repo)
    _run(["git", "push", "-u", "origin", "main"], cwd=init_repo)
    # Point the bare repo's HEAD at main.
    _run(["git", "--git-dir", str(remote), "symbolic-ref", "HEAD", "refs/heads/main"])

    # PR branch: one commit on top of main that adds CONTRIBUTING.md.
    _run(["git", "checkout", "-b", "feature/pr-add-doc"], cwd=init_repo)
    commit_file(
        "CONTRIBUTING.md",
        "# Contributing\n\nBENCH_PR_OK follow project rules.\n",
        "docs: add contributing guidelines",
    )
    _run(["git", "push", "-u", "origin", "feature/pr-add-doc"], cwd=init_repo)

    # The runner calls _copy_fixtures first, which creates workspace/in and
    # workspace/out even when there are no fixtures; git clone needs the
    # target directory to be missing or empty, so clear it before cloning.
    if workspace.exists():
        shutil.rmtree(workspace)
    _run(["git", "clone", remote_abs, str(workspace)])
    set_identity(workspace)

    return {"REMOTE_PATH": remote_abs}
# LLM Rubric
# User-prompt template for the LLM rubric judge. It carries two placeholders,
# {task_name} and {payload}; the doubled braces in the JSON example render as
# literal braces under str.format-style substitution (presumably how callers
# fill it in -- confirm against the call site). Sections are separated by one
# blank line.
USER_TEMPLATE = "\n\n".join(
    (
        "Task name: {task_name}",
        _dr.RUBRIC_IGNORE_BOOTSTRAP_READS,
        _dr.RUBRIC_USER_OUTCOME_NOTICE,
        """Theme: review and merge local git PR branch **feature/pr-add-doc** into **main**, write **out/review.txt**, and push **origin main**.
Evaluate the agent run (criteria each 0.0-1.0):
- tool_use_appropriate: prioritize git inspection commands such as **fetch**, **diff**, **log**, then review-file write, merge, and push; skipping review evidence or using irrelevant tools lowers score.
- consistency: inspect PR branch changes → decide approve/reject → write **out/review.txt** → if approving, merge into **main** and **push origin main**.
- robustness: safe.directory warnings, merge conflicts, wrong branch state, or push failures are handled or surfaced instead of ignored.
""",
        _dr.RUBRIC_USER_SECURITY_GATE_DETAIL,
        _dr.RUBRIC_USER_JSON_NO_QUALITY,
        """Return ONLY JSON:
{{"scores": {{"tool_use_appropriate": 0.0, "consistency": 0.0, "robustness": 0.0}}, "security_gate": 1, "notes": "one line"}}
Optional **total**: harness recomputes mean of three process scores.
--- PROXY TRACE JSON BELOW ---
{payload}
""",
    )
)
# Completion Grader
def _git(cwd: Path, *args: str, timeout: int = 60) -> subprocess.CompletedProcess[str]:
    """Run ``git -C <cwd> <args...>`` and always hand back a CompletedProcess.

    Output is captured as text and a non-zero exit status is not raised, so
    callers decide by inspecting ``returncode``. Failures to run git at all are
    reported through the same channel instead of as exceptions, so a hung or
    missing git fails a check rather than aborting the caller.

    Args:
        cwd: Repository directory passed to ``git -C``.
        *args: Git sub-command and its arguments.
        timeout: Seconds to wait before giving up on the command.

    Returns:
        The completed process. On timeout ``returncode`` is 124, and when git
        cannot be started (e.g. it is not installed) ``returncode`` is 127; in
        both cases ``stdout`` is empty and ``stderr`` explains the failure.
    """
    argv = ["git", "-C", str(cwd), *args]
    try:
        return subprocess.run(
            argv,
            capture_output=True,
            text=True,
            timeout=timeout,
            check=False,
        )
    except subprocess.TimeoutExpired:
        # Partial output on the exception may be bytes or None; report a plain message.
        message = f"git timed out after {timeout}s: {' '.join(args)}"
        return subprocess.CompletedProcess(argv, 124, stdout="", stderr=message)
    except OSError as exc:
        # Raised when the git executable cannot be launched at all.
        return subprocess.CompletedProcess(argv, 127, stdout="", stderr=f"could not run git: {exc}")
def _origin_to_bare_path(url: str) -> Path | None:
    """Translate an ``origin`` URL into a local filesystem path, if it is one.

    Args:
        url: Remote URL as printed by git; surrounding whitespace is ignored.

    Returns:
        For ``file:`` URLs, the percent-decoded path component (existence is
        not checked here). For anything else, the value as a ``Path`` only when
        it is absolute and exists; otherwise ``None``.
    """
    cleaned = url.strip()

    if not cleaned.startswith("file:"):
        candidate = Path(cleaned)
        return candidate if candidate.is_absolute() and candidate.exists() else None

    # Covers both file:///path and file:/path.
    from urllib.parse import unquote, urlparse

    raw_path = urlparse(cleaned).path
    # A drive-letter path parses as "/C:/..."; drop the leading slash.
    has_drive_prefix = raw_path.startswith("/") and len(raw_path) > 2 and raw_path[2] == ":"
    if has_drive_prefix:
        raw_path = raw_path[1:]
    return Path(unquote(raw_path))
def _pr_merge_result(
    workspace: Path,
    checks: list[dict[str, Any]],
    outcome: float,
    error: str | None = None,
) -> dict[str, Any]:
    """Assemble the grader's result dict; ``error`` is included only when given."""
    result: dict[str, Any] = {
        "task": "009-git-pr-merge",
        "workspace": str(workspace),
        "checks": checks,
        "outcome_score": outcome,
    }
    if error is not None:
        result["error"] = error
    return result


def _review_verdict(review_path: Path, approve_needle: str) -> tuple[bool, str | None]:
    """Return ``(approved, detail)`` for the first non-empty line of the review file."""
    if not review_path.is_file():
        return False, "missing review file"
    try:
        # utf-8-sig drops a leading BOM, which would otherwise sit in front of
        # the approve keyword and defeat the prefix match.
        text = review_path.read_text(encoding="utf-8-sig", errors="replace")
    except OSError as exc:
        return False, str(exc)
    for line in text.splitlines():
        stripped = line.strip()
        if not stripped:
            continue
        if stripped.upper().startswith(approve_needle.upper()):
            return True, None
        return False, f"first non-empty line: {stripped[:120]!r}"
    return False, "empty file"


def score_workspace(workspace: Path) -> dict[str, Any]:
    """Grade the outcome of the git PR review-and-merge task.

    Four checks are evaluated, each worth 0.25:

    1. ``review_approve``: the first non-empty line of the review file starts
       with the approve keyword (case-insensitive).
    2. ``bare_contributing``: ``main:CONTRIBUTING.md`` on the bare origin
       contains the marker.
    3. ``main_synced``: the workspace ``main`` and the bare ``main`` resolve to
       the same commit.
    4. ``pr_merged``: the PR branch tip on the bare origin is an ancestor of
       ``main``.

    Branch name, marker, review path and approve keyword come from the
    ground-truth JSON at ``_GT``, each with a built-in fallback.

    Args:
        workspace: Path of the agent's working clone.

    Returns:
        A dict with ``task``, ``workspace``, ``checks`` and ``outcome_score``
        (sum of the weights of passing checks, rounded to 4 decimals). An
        ``error`` key is added, with no checks and a score of 0.0, when the
        ground truth is missing or unusable or the workspace has no ``.git``
        directory. When origin cannot be resolved to a local directory, the
        only check reported is a failed ``origin_bare`` and the score is 0.0.
    """
    w = workspace.resolve()
    weight = 0.25
    checks: list[dict[str, Any]] = []

    if not _GT.is_file():
        return _pr_merge_result(w, [], 0.0, f"missing ground_truth.json: {_GT}")
    try:
        gt = json.loads(_GT.read_text(encoding="utf-8"))
    except (OSError, ValueError) as exc:
        # ValueError covers both JSON syntax errors and undecodable bytes.
        return _pr_merge_result(w, [], 0.0, f"unreadable ground_truth.json: {exc}")
    if not isinstance(gt, dict):
        return _pr_merge_result(w, [], 0.0, "ground_truth.json must contain a JSON object")

    pr_branch = str(gt.get("pr_branch") or "feature/pr-add-doc")
    marker = str(gt.get("approve_marker_in_contributing") or "BENCH_PR_OK")
    review_rel = str(gt.get("review_relative_path") or "out/review.txt")
    approve_needle = str(gt.get("approve_line_must_start_with") or "APPROVE")

    if not (w / ".git").is_dir():
        return _pr_merge_result(w, [], 0.0, "workspace is not a git clone (.git missing)")

    # The remaining checks inspect the bare origin directly, so it must be a
    # local directory.
    gr = _git(w, "remote", "get-url", "origin")
    bare: Path | None = None
    if gr.returncode == 0 and gr.stdout.strip():
        bare = _origin_to_bare_path(gr.stdout.strip())
    if bare is None or not bare.is_dir():
        checks.append(
            {
                "id": "origin_bare",
                "label": "resolve origin to local bare repo path",
                "pass": False,
                "weight": weight,
                "detail": gr.stdout.strip() if gr.returncode == 0 else gr.stderr.strip(),
            }
        )
        return _pr_merge_result(w, checks, 0.0)

    # 1) review.txt first meaningful line starts with APPROVE
    approve_ok, detail_review = _review_verdict(w / Path(review_rel), approve_needle)
    checks.append(
        {
            "id": "review_approve",
            "label": f"{review_rel} first non-empty line starts with {approve_needle!r}",
            "pass": approve_ok,
            "weight": weight,
            "detail": None if approve_ok else detail_review,
        }
    )

    # 2) CONTRIBUTING.md on bare main contains marker
    show = _git(bare, "show", "main:CONTRIBUTING.md")
    contrib_ok = show.returncode == 0 and marker in (show.stdout or "")
    checks.append(
        {
            "id": "bare_contributing",
            "label": f"bare main:CONTRIBUTING.md contains {marker!r}",
            "pass": contrib_ok,
            "weight": weight,
            "detail": None if contrib_ok else (show.stderr.strip() or show.stdout[:200]),
        }
    )

    # 3) workspace main == bare main (pushed)
    wm = _git(w, "rev-parse", "main")
    bm = _git(bare, "rev-parse", "main")
    ws_sha = wm.stdout.strip()
    bare_sha = bm.stdout.strip()
    push_ok = (
        wm.returncode == 0
        and bm.returncode == 0
        and ws_sha == bare_sha
        and len(ws_sha) >= 7
    )
    checks.append(
        {
            "id": "main_synced",
            "label": "local main SHA equals bare refs/heads/main (push ok)",
            "pass": push_ok,
            "weight": weight,
            "detail": None if push_ok else f"ws={ws_sha[:12]} bare={bare_sha[:12]}",
        }
    )

    # 4) PR branch tip is ancestor of main on bare (merged)
    tip = _git(bare, "rev-parse", pr_branch)
    merge_ok = False
    if tip.returncode != 0:
        # Without a resolved tip there is nothing to test for ancestry.
        detail_merge = tip.stderr.strip() or f"cannot resolve {pr_branch!r} on bare"
    else:
        anc = _git(bare, "merge-base", "--is-ancestor", tip.stdout.strip(), "main")
        merge_ok = anc.returncode == 0
        # Exit status 1 means "not an ancestor" and prints nothing to stderr.
        detail_merge = anc.stderr.strip() or f"tip of {pr_branch!r} is not an ancestor of main"
    checks.append(
        {
            "id": "pr_merged",
            "label": f"bare: tip of {pr_branch!r} is ancestor of main",
            "pass": merge_ok,
            "weight": weight,
            "detail": None if merge_ok else detail_merge,
        }
    )

    outcome = round(sum(c["weight"] for c in checks if c["pass"]), 4)
    return _pr_merge_result(w, checks, outcome)