Image Editing: Style Transfer And Scene Extension

Task: image editing.

Workspace, Tool Use & Multimodal OperationsTask 6Oracle + LLM scoring

Task ID013-image-edit

DifficultyHard

Tags

Model Runs6 harnesses & 8 models evaluated on this task.

PromptWorkspace, Tool Use & Multimodal Operations · Task 6

Task: image editing

You have a photo of a cat at $WORKSPACE/in/cat.jpg.

Using an image-generation tool (if available) or other means, produce two new images:

Style transfer: render the cat in a clearly different art style (e.g. cartoon, oil, watercolor, pixel art).

Output: $WORKSPACE/out/cat_styled.png

Scene change: place the cat in a new background/scene (e.g. moon surface, beach, space, study), keeping the cat recognizable.

Output: $WORKSPACE/out/cat_scene.png

Also write $WORKSPACE/out/description.txt briefly covering:

which style/scene you used for each image
tools or prompts used (if any)

After all three files exist, end the conversation immediately.

Autograding focuses on:

both outputs exist and differ clearly from the source
style transfer visibly changes style
scene image changes background while preserving the cat
description.txt is substantive

If cat_styled.png, cat_scene.png, and description.txt are present and meet the brief, stop—no extra tool chatter.

Input Files1 file

in/cat.jpg

Hooks

def prepare_runtime(context: dict[str, Any]) -> dict[str, Any]:
    workspace = Path(context["workspace"])
    (workspace / "out").mkdir(parents=True, exist_ok=True)
    # 提示图像生成工具（如果存在）
    return {
        "IMAGE_GEN_AVAILABLE": "true",
        "EXAMPLE_STYLE": "cartoon"
    }

def after_round(context: dict[str, Any], runtime_state: dict[str, Any], adapter_result: Any) -> dict[str, Any]:
    return runtime_state

def cleanup_runtime(context: dict[str, Any], runtime_state: dict[str, Any]) -> None:
    pass

LLM Rubric

USER_TEMPLATE = """This task: Task: image editing You have a photo of a cat at `$WORKSPACE/in/cat.jpg`. Using an image-generation tool (if available) or other means, produce two new images: 1. **Style transfer:** render the cat in a clearly different...

Key outputs: in/cat.jpg, out/cat_scene.png, out/cat_styled.png, out/description.txt.

Evaluate the agent run:
- tool_use_appropriate: uses tools that match the task modality and files/APIs.
- consistency: follows the prompt requirements and writes the expected artifacts.
- robustness: handles missing directories, malformed inputs, retries, or local-server issues when relevant.

Return ONLY JSON with scores, security_gate, and notes.
"""

Completion Grader

def _to_native(obj):
    if hasattr(obj, "item"):
        return obj.item()
    if isinstance(obj, dict):
        return {k: _to_native(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [_to_native(v) for v in obj]
    return obj

def _image_hash(image_path: Path) -> str:
    """计算图片的简单哈希（用于判断是否修改）"""
    try:
        # 使用 PIL 如果可用，否则回退到文件内容哈希
        from PIL import Image
        import numpy as np
        img = Image.open(image_path).resize((128, 128)).convert("L")
        pixels = np.array(img).flatten()
        # 取前 256 个像素的平均值作为简单指纹
        avg = int(np.mean(pixels))
        return f"{avg}_{hashlib.md5(img.tobytes()).hexdigest()[:8]}"
    except ImportError:
        # 没有 PIL，回退到文件内容哈希
        return hashlib.md5(image_path.read_bytes()).hexdigest()[:16]
    except Exception:
        return "error"

def _description_excerpt_for_quality(w: Path, max_chars: int = 1800) -> str:
    p = w / "out" / "description.txt"
    if not p.is_file():
        return "(description.txt missing)"
    t = p.read_text(encoding="utf-8", errors="replace").strip()
    return t[:max_chars] + ("…[truncated]" if len(t) > max_chars else "")

def score_workspace(workspace: Path) -> dict[str, Any]:
    w = workspace.resolve()
    out_dir = w / "out"
    in_dir = w / "in"
    task_dir = w.parent.parent
    gt_path = task_dir / "ground_truth.json"
    gt = json.loads(gt_path.read_text(encoding="utf-8")) if gt_path.exists() else {}

    required = gt.get("required_images", ["cat_styled.png", "cat_scene.png"])
    min_size = gt.get("min_file_size_bytes", 5000)

    checks = []
    file_score = 0.0
    modified_score = 0.0
    description_score = 0.0

    # 1. 文件存在性 (40%)
    existing = []
    for fname in required:
        fpath = out_dir / fname
        exists = fpath.exists() and fpath.stat().st_size >= min_size
        existing.append(exists)
        checks.append({
            "id": f"file_{fname}",
            "label": f"{fname} 存在且大小≥{min_size}B",
            "pass": exists,
            "weight": 0.40 / len(required),
            "detail": {"size": fpath.stat().st_size if fpath.exists() else 0}
        })
    file_score = sum(existing) / len(required)

    # 2. 图片是否明显不同于原图 (40%)
    original = in_dir / "cat.jpg"
    if original.exists() and file_score > 0:
        # 对每张输出图片，检查与原图的哈希差异
        original_hash = _image_hash(original)
        modified_count = 0
        for idx, fname in enumerate(required):
            fpath = out_dir / fname
            if fpath.exists():
                new_hash = _image_hash(fpath)
                # 如果哈希不同，认为有修改（简单启发）
                if new_hash != original_hash and new_hash != "error":
                    modified_count += 1
                else:
                    # 即使哈希相同，如果文件大小差异很大也可能修改，但简化
                    pass
        modified_score = modified_count / len(required)
        checks.append({
            "id": "image_modification",
            "label": f"图片与原图差异检测: {modified_count}/{len(required)} 不同",
            "pass": modified_score >= 0.5,
            "weight": 0.40,
            "detail": {"different": modified_count, "total": len(required)}
        })
    else:
        checks.append({
            "id": "image_modification",
            "label": "无法检测修改（原图缺失或输出文件缺失）",
            "pass": False,
            "weight": 0.40,
            "detail": None
        })

    # 3. 描述文件质量 (20%)
    desc_file = out_dir / "description.txt"
    if desc_file.exists():
        content = desc_file.read_text(encoding="utf-8").strip()
        # 检查是否包含关键词
        has_style = any(k in content.lower() for k in ["style", "cartoon", "oil", "watercolor", "pixel", "painting"])
        has_scene = any(k in content.lower() for k in ["scene", "background", "moon", "beach", "space", "setting"])
        if has_style and has_scene:
            description_score = 1.0
        elif has_style or has_scene:
            description_score = 0.5
        else:
            description_score = 0.2
        checks.append({
            "id": "description",
            "label": "description.txt 包含风格和场景说明",
            "pass": description_score >= 0.5,
            "weight": 0.20,
            "detail": {"has_style": has_style, "has_scene": has_scene}
        })
    else:
        checks.append({
            "id": "description",
            "label": "description.txt 缺失",
            "pass": False,
            "weight": 0.20,
            "detail": None
        })

    total_score = file_score * 0.40 + modified_score * 0.40 + description_score * 0.20
    thresholds = gt.get("scoring", {}).get("thresholds", {"excellent": 0.90, "good": 0.75, "pass": 0.60})
    if total_score >= thresholds["excellent"]:
        level = "excellent"
    elif total_score >= thresholds["good"]:
        level = "good"
    elif total_score >= thresholds["pass"]:
        level = "pass"
    else:
        level = "fail"

    q_meta: dict[str, Any] = {}
    ql: float | None = None
    try:
        from harnessbench.grading.oracle_quality_llm import run_oracle_quality_llm
        from harnessbench.grading.rubric_llm import build_workspace_image_attachment

        rel_outputs = [f"out/{n}" for n in required]
        user_text = (
            "Task artifacts under workspace/out (images attached when present).
"
            "### description.txt excerpt
"
            + _description_excerpt_for_quality(w)
            + "
Judge whether BOTH edited PNGs satisfy style vs scene requirements and prose matches."
        )
        user_mc = build_workspace_image_attachment(w, rel_outputs, user_text)
        ql, q_meta = run_oracle_quality_llm(system=QUALITY_SYSTEM, user=user_mc)
    except Exception as e:
        q_meta = {"skipped": False, "error": repr(e), "notes": "oracle quality LLM failed"}

    result = {
        "task": "013-image-edit",
        "workspace": str(w),
        "outcome_score": round(float(total_score), 4),
        "level": level,
        "checks": _to_native(checks),
        "summary": {
            "files_exist": round(float(file_score), 4),
            "images_modified": round(float(modified_score), 4),
            "description_quality": round(float(description_score), 4),
        },
        "needs_human_review": total_score >= 0.6 and total_score < 0.9,  # 自动通过但可能需要人工复核
        "auto_grade_only": True,
        "quality_rubric_meta": q_meta,
    }
    if ql is not None:
        result["quality"] = ql
    return json.loads(json.dumps(result, default=str))

def score_workspace_safe(workspace: Path) -> dict[str, Any]:
    try:
        return score_workspace(workspace)
    except Exception as e:
        return {
            "task": "013-image-edit",
            "workspace": str(workspace),
            "outcome_score": 0.0,
            "level": "error",
            "error": str(e),
            "checks": [],
            "summary": {},
        }

← Task 5 Task 7 →