Task 8 — Workspace, Tool Use & Multimodal Operations

Model Runs: 6 harnesses & 8 models evaluated on this task.

Loading...

Prompt: Workspace, Tool Use & Multimodal Operations · Task 8

You are building a small offline file-standardization pipeline. Use only local files.

Input directory:

$WORKSPACE/in/raw/

Create:

$WORKSPACE/out/normalized/
$WORKSPACE/out/rename_log.csv
$WORKSPACE/out/error_report.csv

Supported transformations:

TXT "Key: Value" revenue files

Convert valid region/amount/currency records into JSON.
Output filename format: revenue_<yyyy>_<mm>.json.

Customer CSV exports

Convert valid rows to a JSON array.
Normalize keys to name,email,spend.
Spend must be numeric.
Skip bad rows and report them in error_report.csv.

Inventory JSON files

Convert the items array to CSV with columns sku,qty sorted by sku.
Output filename format: inventory_<warehouse>.csv.

Filename rules:

Recursively scan $WORKSPACE/in/raw/.
Supported extensions are .txt, .csv, and .json, case-insensitive.
Do not process unsupported files.
Normalize output filenames to lowercase snake_case.
If two supported inputs would produce the same output name, keep the first sorted source path as the base name and append __2, __3, etc. to later conflicting output names.

CSV log requirements:

rename_log.csv exact header: source,target,action
Include one row for every successfully transformed supported input.
Sort rows by source path.
Use workspace-relative paths such as in/raw/...

error_report.csv requirements:

Exact header: source,row_or_record,error_type,details
Include malformed supported records and unsupported files.
Unsupported files must be reported but not copied into out/normalized.

Rules:

Do not modify or delete input files.
Do not create extra files in $WORKSPACE/out/normalized/.
JSON outputs must be valid UTF-8 JSON.

Input Files: 8 files

in/raw/2024 FEB revenue.TXT

in/raw/Customer Export 01.csv

in/raw/README.keep

in/raw/bad revenue.txt

in/raw/image.png

in/raw/mixed-case inventory.JSON

in/raw/nested/2024 mar revenue.txt

in/raw/nested/customer export 01.csv

LLM Rubric

# Prompt text for the LLM rubric: restates the task, lists key outputs, names
# three evaluation criteria (tool_use_appropriate, consistency, robustness) and
# asks for a JSON-only reply with scores, security_gate, and notes.
# NOTE(review): the task description on the first line appears elided
# ("error_repo...") in this view — confirm against the full template.
USER_TEMPLATE = """This task: You are building a small offline file-standardization pipeline. Use only local files. Input directory: - $WORKSPACE/in/raw/ Create: - $WORKSPACE/out/normalized/ - $WORKSPACE/out/rename_log.csv - $WORKSPACE/out/error_repo...

Key outputs: in/raw/, in/raw/., out/error_report.csv, out/normalized/, out/normalized/..

Evaluate the agent run:
- tool_use_appropriate: uses tools that match the task modality and files/APIs.
- consistency: follows the prompt requirements and writes the expected artifacts.
- robustness: handles missing directories, malformed inputs, retries, or local-server issues when relevant.

Return ONLY JSON with scores, security_gate, and notes.
"""

Completion Grader

def _sha256(path: Path) -> str:
    """Return the hex SHA-256 digest of the file at *path*.

    The file is read in 64 KiB pieces so large inputs are never held in
    memory all at once.
    """
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        # An empty bytes object marks end-of-file and ends the loop.
        while piece := stream.read(65536):
            digest.update(piece)
    return digest.hexdigest()

def _add(checks: list[dict[str, Any]], cid: str, label: str, ok: bool, weight: float, detail: str | None = None) -> None:
    """Append one check result to *checks*.

    Args:
        checks: Accumulator list, mutated in place.
        cid: Identifier stored under ``"id"``.
        label: Human-readable description of the check.
        ok: Whether the check passed; stored under ``"pass"``.
        weight: Contribution of this check to the overall score.
        detail: Diagnostic text, kept only when the check did not pass.
    """
    # A passing check carries no diagnostic, whatever the caller supplied.
    shown_detail = detail if not ok else None
    record = {
        "id": cid,
        "label": label,
        "pass": ok,
        "weight": weight,
        "detail": shown_detail,
    }
    checks.append(record)

def _workspace_rel(rel: str) -> Path:
    """Convert *rel* to a path, dropping a leading ``fixtures`` component.

    Paths whose first component is anything else are returned unchanged.
    """
    components = Path(rel).parts
    # Slicing keeps the comparison safe when the path has no components.
    if components[:1] == ("fixtures",):
        return Path(*components[1:])
    return Path(rel)

def _normalize_source_row(row: dict[str, str]) -> dict[str, str]:
    """Return a copy of *row* whose ``source`` has no ``fixtures/`` prefix.

    The input mapping is never mutated. Rows with no ``source`` key, or
    whose source does not start with the prefix, are copied unchanged.
    """
    prefix = "fixtures/"
    normalized = dict(row)
    current = normalized.get("source", "")
    if not current.startswith(prefix):
        return normalized
    normalized["source"] = current[len(prefix):]
    return normalized

def _read_csv_rows(path: Path) -> tuple[list[dict[str, Any]], Any, str | None]:
    """Read the CSV at *path* into dict rows for grading.

    Returns:
        ``(rows, fieldnames, error)``. On success ``error`` is ``None``. On
        any failure ``rows`` is empty, ``fieldnames`` is ``None`` and
        ``error`` holds the exception text.
    """
    try:
        with path.open("r", encoding="utf-8", newline="") as f:
            reader = csv.DictReader(f)
            rows = list(reader)
            return rows, reader.fieldnames, None
    except Exception as exc:
        # Deliberately broad: a missing, undecodable or malformed file must
        # become a failed check rather than crash the grader.
        return [], None, str(exc)


def score_workspace(workspace: Path) -> dict[str, Any]:
    """Grade an agent's workspace against ``ground_truth.json``.

    The ground truth file is loaded from the directory containing this
    module. Checks are recorded, in order, for:

    * every input file still matching its recorded SHA-256 hash;
    * the exact set of file names in ``out/normalized``;
    * the content of every expected output (text compared verbatim, JSON
      compared after parsing);
    * ``out/rename_log.csv`` being readable, with the exact header and rows;
    * ``out/error_report.csv`` being readable, with the exact header and at
      least the expected ``(source, row_or_record, error_type)`` triples.

    Args:
        workspace: Root directory holding ``in/`` and ``out/``.

    Returns:
        A dict with ``task``, ``workspace``, the list of ``checks`` and
        ``outcome_score`` (passed weight divided by total weight).
    """
    truth = json.loads((Path(__file__).resolve().parent / "ground_truth.json").read_text(encoding="utf-8"))
    checks: list[dict[str, Any]] = []

    # Inputs must be untouched: a missing file hashes to "" and so fails.
    for rel, expected in truth["source_hashes"].items():
        path = workspace / _workspace_rel(rel)
        actual = _sha256(path) if path.is_file() else ""
        _add(checks, "source_" + Path(rel).name, f"{rel} unchanged", actual == expected, 1.0, f"got {actual!r}")

    normalized = workspace / "out" / "normalized"
    actual_names = sorted(p.name for p in normalized.iterdir()) if normalized.is_dir() else []
    expected_names = sorted(Path(p).name for p in truth["outputs"])
    _add(checks, "target_file_set", "normalized output file set is exact", actual_names == expected_names, 2.0, f"got {actual_names!r}")

    for rel, expected in truth["outputs"].items():
        path = workspace / rel
        if isinstance(expected, str):
            # String expectations are text outputs compared verbatim.
            actual = path.read_text(encoding="utf-8", errors="replace") if path.is_file() else ""
            _add(checks, "content_" + path.name, f"{rel} exact content", actual == expected, 2.0, f"got {actual!r}")
        else:
            # Non-string expectations are JSON outputs: validity and content
            # are scored as two separate checks.
            try:
                actual_json = json.loads(path.read_text(encoding="utf-8"))
            except Exception as exc:
                actual_json = None
                _add(checks, "json_" + path.name, f"{rel} valid JSON", False, 1.0, str(exc))
            else:
                _add(checks, "json_" + path.name, f"{rel} valid JSON", True, 1.0)
            _add(checks, "content_" + path.name, f"{rel} exact transformed data", actual_json == expected, 2.0, f"got {actual_json!r}")

    rows, fieldnames, log_error = _read_csv_rows(workspace / "out" / "rename_log.csv")
    _add(checks, "log_read", "rename_log.csv can be parsed", log_error is None, 1.0, log_error)
    _add(checks, "log_header", "rename_log.csv header is exact", fieldnames == ["source", "target", "action"], 1.0, f"got {fieldnames!r}")
    expected_rows = [_normalize_source_row(row) for row in truth["rename_log_rows"]]
    _add(checks, "log_rows", "rename_log.csv rows are exact and sorted", rows == expected_rows, 3.0, f"got {rows!r}")

    err_rows, err_fields, err_error = _read_csv_rows(workspace / "out" / "error_report.csv")
    _add(checks, "error_report_read", "error_report.csv can be parsed", err_error is None, 1.0, err_error)
    _add(checks, "error_report_header", "error_report.csv header is exact", err_fields == ["source", "row_or_record", "error_type", "details"], 1.0, f"got {err_fields!r}")
    # Strip the "fixtures/" prefix from expected sources exactly as is done
    # for the rename log; otherwise prefixed ground-truth entries could never
    # match the workspace-relative paths an agent writes.
    expected_errors = {
        (r["source"], r["row_or_record"], r["error_type"])
        for r in map(_normalize_source_row, truth["error_rows"])
    }
    actual_errors = {(r.get("source", ""), r.get("row_or_record", ""), r.get("error_type", "")) for r in err_rows}
    # Subset test: extra reported errors are tolerated, missing ones are not.
    _add(checks, "error_report_rows", "error_report.csv reports malformed and unsupported inputs", expected_errors <= actual_errors, 3.0, f"got {err_rows!r}")

    total_weight = sum(c["weight"] for c in checks)
    passed_weight = sum(c["weight"] for c in checks if c["pass"])
    score = passed_weight / total_weight
    return {"task": "021-batch-rename-transform", "workspace": str(workspace), "checks": checks, "outcome_score": score}

Batch rename and format transform