Offline Knowledge Base QA with Evidence Attribution

Task: Offline Document QA with Evidence Attribution

Knowledge, Evidence & Retrieval · Task 3 · Oracle + LLM scoring

Task ID: 033-offline-knowledge-qa

Difficulty: Easy

Tags

Model Runs: 6 harnesses & 8 models evaluated on this task.

Prompt: Knowledge, Evidence & Retrieval · Task 3

Task: Offline Document QA with Evidence Attribution

You may only use the offline materials in $WORKSPACE/in/docs/*.md and $WORKSPACE/in/questions.json. Do not go online, and do not use common knowledge to fill in information not present in the materials.

Read all questions from questions.json and write the answers to $WORKSPACE/out/answers.json.

Output format:

The top level must be a JSON array, each element corresponding to one question.
Each element must include: question_id, answer, source_file, quote_or_signal.
If there is insufficient evidence in the materials to answer the question, answer must be insufficient_evidence, source_file must be null or an empty string, and quote_or_signal must explain what type of evidence is missing.
For answerable questions, source_file must be a relative path under docs/, e.g., docs/operations.md.
quote_or_signal must provide a short quote or locating signal that supports the answer; do not write only “see document”.

Prohibitions:

Do not modify any files in fixtures.
Do not output extra Markdown wrapping.
Do not fabricate answers for unanswerable questions.

Input Files: 4 files

in/docs/operations.md

in/docs/rivergate.md

in/docs/security.md

in/questions.json

LLM Rubric

# User-message template for the LLM rubric. It carries two single-brace
# placeholders, {task_name} and {payload}; the JSON example uses doubled
# braces, so the template is presumably rendered with str.format (confirm
# against the caller). The shared `_dr.RUBRIC_*` fragments are spliced in
# between the task-specific literals, in the order listed below.
USER_TEMPLATE = "".join(
    (
        "Task name: {task_name}\n\n",
        _dr.RUBRIC_IGNORE_BOOTSTRAP_READS,
        "\n\n",
        _dr.RUBRIC_USER_OUTCOME_NOTICE,
        "\n\nEvaluate offline document QA. The agent should inspect docs/questions, produce out/answers.json, cite source_file and quote_or_signal, and mark unsupported questions as insufficient_evidence.\n\n",
        "Score tool_use_appropriate, consistency, robustness from 0.0-1.0 (process only).\n\n",
        # The line above already ends with a blank line; this separator adds a
        # second one before the security-gate text.
        "\n\n",
        _dr.RUBRIC_USER_SECURITY_GATE_DETAIL,
        "\n\n",
        _dr.RUBRIC_USER_JSON_NO_QUALITY,
        "\n\n",
        "Return ONLY JSON: {{\"scores\": {{\"tool_use_appropriate\": 0.0, \"consistency\": 0.0, \"robustness\": 0.0}}, \"security_gate\": 1, \"notes\": \"one line\"}}\n\n",
        "--- PROXY TRACE JSON BELOW ---\n{payload}",
    )
)

Completion Grader

def _load_json(path: Path) -> Any:
    """Decode the UTF-8 JSON file at *path* and return the parsed value.

    Errors from opening the file or from the JSON decoder propagate to the
    caller unchanged.
    """
    with path.open(encoding="utf-8") as handle:
        return json.load(handle)


# def _norm(value: Any) -> str:
#     return str(value or "").strip().lower()

def _norm(value: Any) -> str:
    """Return *value* as trimmed, lower-cased text with UTC time ranges canonicalised.

    ``None`` becomes the empty string; every other value is converted with
    ``str()``. A two-digit ``hh:mm-hh:mm`` range written next to ``utc`` (either
    ``hh:mm-hh:mm utc`` with any spacing, or ``utc hh:mm-hh:mm``) is rewritten
    to the single form ``hh:mm-hh:mm utc`` so both spellings compare equal.
    """
    # Only None means "absent". `value or ""` would also erase legitimate
    # falsy values such as the number 0, turning a real answer into "".
    text = "" if value is None else str(value)
    s = text.strip().lower()
    # Normalize time range format: transform "hh:mm-hh:mm utc" or "utc hh:mm-hh:mm" to standard
    s = re.sub(r'(\d{2}:\d{2})-(\d{2}:\d{2})\s*utc', r'\1-\2 utc', s, flags=re.I)
    s = re.sub(r'utc\s*(\d{2}:\d{2})-(\d{2}:\d{2})', r'\1-\2 utc', s, flags=re.I)
    return s



def score_workspace(workspace: Path) -> dict[str, Any]:
    """Grade ``out/answers.json`` under *workspace* against ``ground_truth.json``.

    ``outcome_score`` is a weighted sum of four components (weights sum to 1.0):

    * format (0.15): every object row carries the four required keys and the
      number of distinct question ids equals the number of ground-truth
      questions. A top-level object holding an ``answers`` list is accepted
      in place of a bare array.
    * facts (0.35): share of answerable questions whose normalised answer
      contains every expected fact.
    * evidence (0.30): share of evidence hits. Each answerable question can
      earn one hit for an exact ``source_file`` match and one for a quote
      containing every expected token; each unanswerable question can earn
      one hit for an empty ``source_file`` plus a quote containing the
      expected missing signal.
    * insufficient (0.20): share of unanswerable questions answered
      ``insufficient_evidence``.

    Args:
        workspace: Directory expected to contain ``out/answers.json``.

    Returns:
        A dict with ``task``, ``workspace``, ``outcome_score`` (rounded to four
        places), ``level`` and ``checks``. When the answers file is missing the
        score is 0.0 with level ``fail``. Any exception raised while reading or
        grading the answers file is recorded as a ``parse_error`` check and the
        components not yet computed stay at 0.0.

    Raises:
        Whatever ``_load_json`` raises for the ground-truth file, and
        ``KeyError`` if the ground truth lacks ``scoring.thresholds``; those
        lookups are not guarded.
    """
    w = Path(workspace).resolve()
    gt = _load_json(TASK_DIR / "ground_truth.json")
    out_path = w / "out" / "answers.json"
    checks: list[dict[str, Any]] = []
    answers_score = 0.0
    source_score = 0.0
    insufficient_score = 0.0
    format_score = 0.0

    if not out_path.is_file():
        # Same shape as the normal return (including "workspace") so callers
        # can treat both results uniformly.
        return {
            "task": "033-offline-knowledge-qa",
            "workspace": str(w),
            "outcome_score": 0.0,
            "level": "fail",
            "checks": [{"id": "answers_missing", "pass": False, "weight": 1.0, "detail": "out/answers.json missing"}],
        }

    required_keys = {"question_id", "answer", "source_file", "quote_or_signal"}
    try:
        data = _load_json(out_path)
        rows = data if isinstance(data, list) else data.get("answers", [])
        by_id = {str(row.get("question_id")): row for row in rows if isinstance(row, dict)}
        format_ok = isinstance(rows, list) and all(required_keys.issubset(row) for row in rows if isinstance(row, dict))
        format_score = 1.0 if format_ok and len(by_id) == len(gt["answers"]) else 0.0
        checks.append({"id": "format", "label": "answers.json is a complete JSON array with required fields", "pass": bool(format_score), "weight": 0.15, "detail": {"rows": len(by_id)}})

        expected = gt["answers"]
        insufficient_count = sum(1 for exp in expected.values() if exp.get("insufficient"))
        answerable_count = len(expected) - insufficient_count

        fact_hits = 0
        source_hits = 0
        quote_hits = 0
        insufficient_hits = 0
        for qid, exp in expected.items():
            row = by_id.get(qid, {})
            answer = _norm(row.get("answer"))
            source = str(row.get("source_file") or "")
            quote = _norm(row.get("quote_or_signal"))
            if exp.get("insufficient"):
                if answer == "insufficient_evidence":
                    insufficient_hits += 1
                # The quote is normalised, so the expected signal must be
                # normalised too or a signal with capitals could never match.
                if not source and _norm(exp["missing_signal"]) in quote:
                    quote_hits += 1
            else:
                if all(_norm(fact) in answer for fact in exp["facts"]):
                    fact_hits += 1
                if source == exp["source_file"]:
                    source_hits += 1
                if all(_norm(token) in quote for token in exp["quote_tokens"]):
                    quote_hits += 1

        # Denominators are derived from the ground truth rather than assuming
        # exactly one unanswerable question; an empty category scores 0.0
        # instead of raising ZeroDivisionError.
        evidence_slots = answerable_count * 2 + insufficient_count
        answers_score = fact_hits / answerable_count if answerable_count else 0.0
        source_score = (source_hits + quote_hits) / evidence_slots if evidence_slots else 0.0
        insufficient_score = insufficient_hits / insufficient_count if insufficient_count else 0.0
        checks.append({"id": "facts", "label": "answer facts match offline documents", "pass": answers_score >= 0.75, "weight": 0.35, "detail": {"fact_hits": fact_hits, "answerable": answerable_count}})
        checks.append({"id": "evidence", "label": "source_file and quote_or_signal support answers", "pass": source_score >= 0.70, "weight": 0.30, "detail": {"source_hits": source_hits, "quote_hits": quote_hits}})
        checks.append({"id": "insufficient", "label": "unanswerable question marked insufficient_evidence", "pass": insufficient_score == 1, "weight": 0.20, "detail": {"hits": insufficient_hits}})
    except Exception as exc:
        # Grading boundary: a malformed submission must yield a scored result,
        # not crash the harness, so the failure is recorded as a check.
        checks.append({"id": "parse_error", "label": "answers.json parseable", "pass": False, "weight": 1.0, "detail": str(exc)})

    total = 0.15 * format_score + 0.35 * answers_score + 0.30 * source_score + 0.20 * insufficient_score
    th = gt["scoring"]["thresholds"]
    level = "excellent" if total >= th["excellent"] else "good" if total >= th["good"] else "pass" if total >= th["pass"] else "fail"
    return {"task": "033-offline-knowledge-qa", "workspace": str(w), "outcome_score": round(float(total), 4), "level": level, "checks": checks}

← Task 2 Task 4 →