Model Runs: 6 harnesses & 8 models evaluated on this task.
Loading...
Prompt: Knowledge, Evidence & Retrieval · Task 3
Task: Offline Document QA with Evidence Attribution
You may only use the offline materials in $WORKSPACE/in/docs/*.md and $WORKSPACE/in/questions.json. Do not go online, and do not use common knowledge to fill in information not present in the materials.
Read all questions from questions.json and write the answers to $WORKSPACE/out/answers.json.
Output format:
- The top level must be a JSON array, each element corresponding to one question.
- Each element must include: `question_id`, `answer`, `source_file`, `quote_or_signal`.
- If there is insufficient evidence in the materials to answer the question, `answer` must be `insufficient_evidence`, `source_file` must be `null` or an empty string, and `quote_or_signal` must explain what type of evidence is missing.
- For answerable questions, `source_file` must be a relative path under `docs/`, e.g., `docs/operations.md`. `quote_or_signal` must provide a short quote or locating signal that supports the answer; do not write only “see document”.
Prohibitions:
- Do not modify any files in fixtures.
- Do not output extra Markdown wrapping.
- Do not fabricate answers for unanswerable questions.
Input Files: 4 files
in/docs/operations.md
in/docs/rivergate.md
in/docs/security.md
in/questions.json
LLM Rubric
# Prompt template sent to the LLM rubric judge. It interleaves literal text
# with shared rubric fragments taken from ``_dr``. ``{task_name}`` and
# ``{payload}`` are placeholders and the JSON example uses doubled braces,
# so this is presumably rendered with ``str.format`` -- TODO confirm at the
# call site.
USER_TEMPLATE = (
    "Task name: {task_name}\n\n"
    + _dr.RUBRIC_IGNORE_BOOTSTRAP_READS
    + "\n\n"
    + _dr.RUBRIC_USER_OUTCOME_NOTICE
    + "\n\nEvaluate offline document QA. The agent should inspect docs/questions, produce out/answers.json, cite source_file and quote_or_signal, and mark unsupported questions as insufficient_evidence.\n\n"
    "Score tool_use_appropriate, consistency, robustness from 0.0-1.0 (process only).\n\n"
    + "\n\n"
    + _dr.RUBRIC_USER_SECURITY_GATE_DETAIL
    + "\n\n"
    + _dr.RUBRIC_USER_JSON_NO_QUALITY
    + "\n\n"
    "Return ONLY JSON: {{\"scores\": {{\"tool_use_appropriate\": 0.0, \"consistency\": 0.0, \"robustness\": 0.0}}, \"security_gate\": 1, \"notes\": \"one line\"}}\n\n"
    "--- PROXY TRACE JSON BELOW ---\n{payload}"
)

# Completion Grader
def _load_json(path: Path) -> Any:
return json.loads(path.read_text(encoding="utf-8"))
# def _norm(value: Any) -> str:
# return str(value or "").strip().lower()
def _norm(value: Any) -> str:
s = str(value or "").strip().lower()
# Normalize time range format: transform "hh:mm-hh:mm utc" or "utc hh:mm-hh:mm" to standard
s = re.sub(r'(\d{2}:\d{2})-(\d{2}:\d{2})\s*utc', r'\1-\2 utc', s, flags=re.I)
s = re.sub(r'utc\s*(\d{2}:\d{2})-(\d{2}:\d{2})', r'\1-\2 utc', s, flags=re.I)
return s
def _ratio(hits: int, total: int) -> float:
    """Return ``hits / total``, or 0.0 when there is nothing to score."""
    return hits / total if total else 0.0


def score_workspace(workspace: Path) -> dict[str, Any]:
    """Grade ``out/answers.json`` in *workspace* against the ground truth.

    The ground truth is read from ``TASK_DIR / "ground_truth.json"``. Its
    ``answers`` mapping is keyed by question id; each entry is either
    marked ``insufficient`` (with a ``missing_signal``) or carries
    ``facts``, ``source_file`` and ``quote_tokens``.

    Four weighted components make up the outcome score:

    * format (0.15) - every row is an object with the required fields and
      there is one distinct ``question_id`` per ground-truth entry;
    * facts (0.35) - share of answerable questions whose answer contains
      every expected fact;
    * evidence (0.30) - share of matching ``source_file`` values and
      quote/signal hits over all evidence slots;
    * insufficient (0.20) - share of unanswerable questions answered with
      ``insufficient_evidence``.

    Args:
        workspace: Directory that contains ``out/answers.json``.

    Returns:
        A result dict with ``task``, ``outcome_score``, ``level`` and
        ``checks``, plus ``workspace`` when the answers file exists.
    """
    task_id = "033-offline-knowledge-qa"
    w = Path(workspace).resolve()
    gt = _load_json(TASK_DIR / "ground_truth.json")
    out_path = w / "out" / "answers.json"
    checks: list[dict[str, Any]] = []
    answers_score = 0.0
    source_score = 0.0
    insufficient_score = 0.0
    format_score = 0.0

    if not out_path.is_file():
        return {
            "task": task_id,
            "outcome_score": 0.0,
            "level": "fail",
            "checks": [
                {
                    "id": "answers_missing",
                    "pass": False,
                    "weight": 1.0,
                    "detail": "out/answers.json missing",
                }
            ],
        }

    # Broad catch on purpose: this is the grader's outermost boundary and a
    # malformed submission must yield a failing result, never a crash.
    try:
        expected = gt["answers"]
        data = _load_json(out_path)
        # A {"answers": [...]} wrapper is tolerated alongside a bare array.
        rows = data if isinstance(data, list) else data.get("answers", [])
        by_id = {
            str(row.get("question_id")): row
            for row in rows
            if isinstance(row, dict)
        }
        required = {"question_id", "answer", "source_file", "quote_or_signal"}
        # Every element must be an object; stray non-dict entries fail the
        # format check instead of being skipped.
        format_ok = isinstance(rows, list) and all(
            isinstance(row, dict) and required.issubset(row) for row in rows
        )
        format_score = 1.0 if format_ok and len(by_id) == len(expected) else 0.0
        checks.append(
            {
                "id": "format",
                "label": "answers.json is a complete JSON array with required fields",
                "pass": bool(format_score),
                "weight": 0.15,
                "detail": {"rows": len(by_id)},
            }
        )

        fact_hits = 0
        source_hits = 0
        quote_hits = 0
        insufficient_hits = 0
        for qid, exp in expected.items():
            row = by_id.get(qid, {})
            answer = _norm(row.get("answer"))
            source = str(row.get("source_file") or "")
            quote = _norm(row.get("quote_or_signal"))
            if exp.get("insufficient"):
                if answer == "insufficient_evidence":
                    insufficient_hits += 1
                # NOTE(review): the original indentation was lost; this check
                # is reconstructed as independent of the answer check above,
                # mirroring the answerable branch -- confirm against the
                # upstream file.
                # The quote is lower-cased, so the signal is normalised too.
                if not source and _norm(exp["missing_signal"]) in quote:
                    quote_hits += 1
            else:
                if all(_norm(fact) in answer for fact in exp["facts"]):
                    fact_hits += 1
                if source == exp["source_file"]:
                    source_hits += 1
                if all(_norm(token) in quote for token in exp["quote_tokens"]):
                    quote_hits += 1

        answerable_count = sum(
            1 for exp in expected.values() if not exp.get("insufficient")
        )
        insufficient_count = len(expected) - answerable_count
        # Answerable questions have two evidence slots (source + quote);
        # unanswerable ones have a single quote/signal slot.
        evidence_slots = answerable_count * 2 + insufficient_count

        answers_score = _ratio(fact_hits, answerable_count)
        source_score = _ratio(source_hits + quote_hits, evidence_slots)
        insufficient_score = _ratio(insufficient_hits, insufficient_count)

        checks.append(
            {
                "id": "facts",
                "label": "answer facts match offline documents",
                "pass": answers_score >= 0.75,
                "weight": 0.35,
                "detail": {"fact_hits": fact_hits, "answerable": answerable_count},
            }
        )
        checks.append(
            {
                "id": "evidence",
                "label": "source_file and quote_or_signal support answers",
                "pass": source_score >= 0.70,
                "weight": 0.30,
                "detail": {"source_hits": source_hits, "quote_hits": quote_hits},
            }
        )
        checks.append(
            {
                "id": "insufficient",
                "label": "unanswerable question marked insufficient_evidence",
                "pass": insufficient_count > 0
                and insufficient_hits == insufficient_count,
                "weight": 0.20,
                "detail": {"hits": insufficient_hits},
            }
        )
    except Exception as exc:
        checks.append(
            {
                "id": "parse_error",
                "label": "answers.json parseable",
                "pass": False,
                "weight": 1.0,
                "detail": str(exc),
            }
        )

    total = (
        0.15 * format_score
        + 0.35 * answers_score
        + 0.30 * source_score
        + 0.20 * insufficient_score
    )
    th = gt["scoring"]["thresholds"]
    # Highest tier whose threshold is met; "fail" when none is.
    level = "fail"
    for tier in ("excellent", "good", "pass"):
        if total >= th[tier]:
            level = tier
            break
    return {
        "task": task_id,
        "workspace": str(w),
        "outcome_score": round(float(total), 4),
        "level": level,
        "checks": checks,
    }