Excel-like sales table cleaning

You are cleaning a messy sales export for a finance reconciliation review. Use only local files.

Data, BI & Finance AnalyticsTask 1Oracle + LLM scoring

Task ID049-excel-like-cleaning

DifficultyEasy

Tags

Model Runs6 harnesses & 8 models evaluated on this task.

PromptData, BI & Finance Analytics · Task 1

You are cleaning a messy sales export for a finance reconciliation review. Use only local files.

Inputs copied to the workspace root:

$WORKSPACE/in/messy_sales.csv
$WORKSPACE/in/customer_dictionary.csv
$WORKSPACE/in/fx_rates.csv

Create exactly these outputs:

$WORKSPACE/out/cleaned_sales.csv
$WORKSPACE/out/reject_ledger.csv
$WORKSPACE/out/reject_summary.json
$WORKSPACE/out/cleaning_report.md

Cleaning rules:

Do not modify input files.
Validate customer_id against customer_dictionary.csv. Reject rows whose customer_id is unknown or inactive.
Parse order_date using the row locale:

en_US dates are month/day/year.
en_GB and de_DE dates are day/month/year.
ISO dates are already year-month-day.
Dates with month names such as "10-Jan-2025" are valid.

Normalize status to lowercase. Refund/returned rows must keep a negative amount_usd.
Normalize amount to USD using fx_rates.csv. The amount field is already the row total, not a unit price. Strip currency symbols and accounting parentheses. For en_US/en_GB, comma is the thousands separator and dot is the decimal separator. For de_DE, dot is the thousands separator and comma is the decimal separator. Round amount_usd to exactly two decimals.
Quantity must be a positive integer.
Deduplicate by order_id after validation. Keep the first valid occurrence in input order and reject later valid duplicates with reason duplicate_order_id. Invalid rows are rejected for their validation reason rather than duplicate_order_id.
Reject rows with invalid dates, unsupported currency, missing/non-numeric amounts, invalid quantities, unknown/inactive customers, or duplicates.
If a row has multiple reject reasons, record the first applicable reason in this priority order:

missing_amount, unsupported_currency, invalid_quantity, inactive_customer, unknown_customer, invalid_date, duplicate_order_id.

Refund/returned rows must produce a negative amount_usd exactly once: a positive returned amount becomes negative, and an already-negative returned amount stays negative.

cleaned_sales.csv requirements:

Exact header:

order_id,order_date,customer_id,customer_name,sku,quantity,amount_usd,status

Rows must be sorted by order_date then order_id.
Do not include extra columns.

reject_ledger.csv requirements:

Exact header:

order_id,reason,source_row,notes

Include one row per rejected input row. source_row is 1-based data-row number from messy_sales.csv, excluding the header.
Use one of the reason codes listed in rule 9.

reject_summary.json requirements:

Valid JSON object keyed by every reason code in rule 9, including zero-count reasons if any.
Each reason entry must contain count, source_rows, and order_ids.
source_rows must be numeric arrays in ascending input order.

cleaning_report.md requirements:

Summarize valid row count, rejected row count, duplicate count, and total amount_usd.
Explain each reject reason category.
Mention any locale or FX assumptions used.

Do not use network access or external APIs.

Input Files3 files

in/customer_dictionary.csv

in/fx_rates.csv

in/messy_sales.csv

LLM Rubric

USER_TEMPLATE = (
    "Task name: {task_name}\n\n"
    + _dr.RUBRIC_IGNORE_BOOTSTRAP_READS
    + "\n\n"
    + _dr.RUBRIC_USER_OUTCOME_NOTICE
    + "\n\nEvaluate whether the agent used a reproducible local workflow across messy_sales.csv, customer_dictionary.csv, "
    "and fx_rates.csv; preserved fixtures; reasoned about locale dates, FX conversion, refunds, duplicate handling, "
    "and reject-ledger categories; and avoided unsupported assumptions. Deterministic artifact correctness is handled "
    "by oracle_grade.py; focus on process quality and report clarity.\n\n"
    + "\n\n"
    + _dr.RUBRIC_USER_SECURITY_GATE_DETAIL
    + "\n\n"
    + _dr.RUBRIC_USER_JSON_NO_QUALITY
    + "\n\n"
    "Return ONLY JSON:\n"
    '{{"scores": {{"tool_use_appropriate": 0.0, "consistency": 0.0, "robustness": 0.0}}, "security_gate": 1, "notes": "one line"}}\n\n'
    "Optional **total**: harness recomputes mean of three process scores.\n\n--- PROXY TRACE JSON BELOW ---\n{payload}"
)

Completion Grader

def _check(checks: list[dict[str, Any]], cid: str, passed: bool, weight: float, detail: str | None = None) -> None:
    checks.append({"id": cid, "pass": bool(passed), "weight": weight, "detail": None if passed else detail})


def _read_csv(path: Path) -> tuple[list[str], list[dict[str, str]]]:
    with path.open(newline="", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        return list(reader.fieldnames or []), [{k: (v or "").strip() for k, v in row.items()} for row in reader]


def _fixtures_unchanged(workspace: Path) -> bool:
    fixture_dir = _TASK_DIR / "fixtures" / "in"
    for original in fixture_dir.iterdir():
        candidate = workspace / "in" / original.name
        if not candidate.is_file():
            return False
        if hashlib.sha256(candidate.read_bytes()).hexdigest() != hashlib.sha256(original.read_bytes()).hexdigest():
            return False
    return True


def _has_report_terms(text: str, terms: list[str]) -> bool:
    aliases = {
        "valid row count": ("valid row count", "valid rows"),
        "rejected row count": ("rejected row count", "rejected rows"),
        "duplicate": ("duplicate", "duplicates", "duplicate rows"),
        "invalid_date": ("invalid_date", "invalid date", "invalid dates"),
        "unsupported_currency": ("unsupported_currency", "unsupported currency", "unsupported currencies"),
    }
    for term in terms:
        options = aliases.get(term, (term,))
        if not any(option in text for option in options):
            return False
    return True


def score_workspace(workspace: str | Path, *, ground_truth_path: Path | None = None) -> dict[str, Any]:
    w = Path(workspace).resolve()
    gt = json.loads((ground_truth_path or _DEFAULT_GT).read_text(encoding="utf-8"))
    checks: list[dict[str, Any]] = []

    _check(checks, "fixtures_unchanged", _fixtures_unchanged(w), 0.08, "one or more input files are missing or modified")
    for name, expected_count in gt["input_counts"].items():
        fixture = w / "in" / f"{name}.csv"
        try:
            _, fixture_rows = _read_csv(fixture)
            _check(checks, f"fixture_{name}_row_count", len(fixture_rows) == expected_count, 0.02, f"{name}.csv row count changed")
        except Exception as exc:
            _check(checks, f"fixture_{name}_row_count", False, 0.02, str(exc))

    out_csv = w / gt["outputs"]["cleaned_csv"]
    report = w / gt["outputs"]["report"]
    _check(checks, "cleaned_csv_exists", out_csv.is_file(), 0.08, "missing out/cleaned_sales.csv")
    reject_path = w / gt["outputs"]["reject_ledger"]
    reject_summary_path = w / gt["outputs"].get("reject_summary", "out/reject_summary.json")
    _check(checks, "report_exists", report.is_file(), 0.04, "missing out/cleaning_report.md")
    _check(checks, "reject_ledger_exists", reject_path.is_file(), 0.06, "missing out/reject_ledger.csv")
    _check(checks, "reject_summary_exists", reject_summary_path.is_file(), 0.06, "missing out/reject_summary.json")

    rows: list[dict[str, str]] = []
    if out_csv.is_file():
        try:
            header, rows = _read_csv(out_csv)
            _check(checks, "exact_header", header == gt["cleaned_header"], 0.08, f"got {header}")
            _check(checks, "exact_cleaned_rows", rows == gt["cleaned_rows"], 0.30, f"got {rows}")
            _check(checks, "iso_dates", all(re.fullmatch(r"\d{4}-\d{2}-\d{2}", r.get("order_date", "")) for r in rows), 0.05, "dates must be YYYY-MM-DD")
            _check(checks, "amount_decimals", all(re.fullmatch(r"-?\d+\.\d{2}", r.get("amount_usd", "")) for r in rows), 0.05, "amounts must have two decimals")
            _check(checks, "locale_fx_effects", any(r.get("amount_usd") == "21.45" for r in rows) and any(r.get("amount_usd") == "-19.99" for r in rows), 0.08, "locale, FX, or refund handling missing")
            by_order = {r.get("order_id", ""): r for r in rows}
            _check(checks, "de_de_decimal_amount", by_order.get("S1020", {}).get("amount_usd") == "1358.02", 0.06, "de_DE decimal/thousands parsing is wrong")
            _check(checks, "us_thousands_amount", by_order.get("S1021", {}).get("amount_usd") == "1234.56", 0.04, "en_US thousands parsing is wrong")
            _check(checks, "refund_sign_once", by_order.get("S1022", {}).get("amount_usd") == "-11.00" and by_order.get("S1023", {}).get("amount_usd") == "-11.00", 0.06, "returned/refunded sign handling is wrong")
            clean_ids = {r.get("order_id") for r in rows}
            _check(checks, "no_duplicate_clean_rows", len(clean_ids) == len(rows), 0.05, "duplicate order_id in cleaned rows")
        except Exception as exc:
            _check(checks, "csv_readable", False, 0.10, str(exc))
    else:
        _check(checks, "de_de_decimal_amount", False, 0.06, "missing out/cleaned_sales.csv")
        _check(checks, "us_thousands_amount", False, 0.04, "missing out/cleaned_sales.csv")
        _check(checks, "refund_sign_once", False, 0.06, "missing out/cleaned_sales.csv")

    if reject_path.is_file():
        try:
            reject_header, reject_rows = _read_csv(reject_path)
            _check(checks, "reject_header", reject_header == gt["reject_header"], 0.06, f"got {reject_header}")
            expected_rejects = sorted(gt["reject_rows"], key=lambda r: int(r["source_row"]))
            got_slim = sorted(
                [{"order_id": r.get("order_id", ""), "reason": r.get("reason", ""), "source_row": r.get("source_row", "")} for r in reject_rows],
                key=lambda r: int(r.get("source_row") or 0),
            )
            notes_ok = all(r.get("notes", "").strip() for r in reject_rows)
            _check(checks, "reject_reasons", got_slim == expected_rejects and notes_ok, 0.14, f"got {got_slim}")
        except Exception as exc:
            _check(checks, "reject_readable", False, 0.10, str(exc))
    else:
        _check(checks, "reject_header", False, 0.06, "missing out/reject_ledger.csv")
        _check(checks, "reject_reasons", False, 0.14, "missing out/reject_ledger.csv")

    if reject_summary_path.is_file():
        try:
            summary = json.loads(reject_summary_path.read_text(encoding="utf-8"))
            expected = gt.get("reject_summary_expected", {})
            summary_ok = isinstance(summary, dict) and set(summary) == set(expected)
            for reason, exp in expected.items():
                got = summary.get(reason, {}) if isinstance(summary, dict) else {}
                summary_ok = summary_ok and int(got.get("count", -1)) == int(exp["count"])
                summary_ok = summary_ok and [int(x) for x in got.get("source_rows", [])] == exp["source_rows"]
                summary_ok = summary_ok and list(got.get("order_ids", [])) == exp["order_ids"]
            _check(checks, "reject_summary_exact", summary_ok, 0.12, f"got {summary}")
        except Exception as exc:
            _check(checks, "reject_summary_exact", False, 0.12, str(exc))
    else:
        _check(checks, "reject_summary_exact", False, 0.12, "missing out/reject_summary.json")

    if report.is_file():
        text = report.read_text(encoding="utf-8", errors="replace").lower()
        reject_categories = {row["reason"] for row in gt["reject_rows"]}
        _check(checks, "report_reject_categories", all(reason.lower() in text for reason in reject_categories), 0.18, "not all reject categories are explained")
        _check(checks, "report_terms", _has_report_terms(text, gt["required_report_terms"]), 0.07, "missing required summary terms")
    else:
        _check(checks, "report_reject_categories", False, 0.18, "missing out/cleaning_report.md")
        _check(checks, "report_terms", False, 0.07, "missing out/cleaning_report.md")

    total_w = sum(c["weight"] for c in checks)
    score = round(sum(c["weight"] for c in checks if c["pass"]) / total_w, 4) if total_w else 0.0
    if any(c["id"] == "exact_cleaned_rows" and not c["pass"] for c in checks):
        score = min(score, 0.69)
    return {"task": "049-excel-like-cleaning", "workspace": str(w), "checks": checks, "outcome_score": score, "score": score}

Task 2 →