Multi-Document Synthesis And Contradiction Detection

Task: multi-document synthesis and contradiction detection

Knowledge, Evidence & Retrieval · Task 1 · Oracle + LLM scoring

Task ID: 012-doc-synthesis

Difficulty: Medium

Tags

Model Runs: 6 harnesses & 8 models evaluated on this task.

Prompt: Knowledge, Evidence & Retrieval · Task 1

Task: multi-document synthesis and contradiction detection

You have five documents about the same incident under $WORKSPACE/in/:

doc1_news_site_a.md (NewsSite A article)
doc2_news_site_b.md (NewsSite B article)
doc3_social_media.md (social posts / rumors)
doc4_official_statement.md (official statement)
doc5_expert_analysis.md (expert analysis)

They conflict and may contain errors or bias.

Complete the following; write everything under $WORKSPACE/out/:

Per-document trust scores (trustworthiness.json)

Score each document 0–10 with a short rationale (e.g. source authority, internal consistency, evidence chain). Shape: { "doc1_news_site_a.md": {"score": 6, "reason": "specific numbers but sensational tone"}, ... }

Contradictions (contradictions.json)

List key contradictions (at least 3), citing which documents are involved and representative quotes/snippets. Shape: { "contradictions": [ { "claim": "example axis of disagreement", "documents": ["doc1", "doc2", "doc4"], "quotes": ["...", "...", "..."], "resolution": "your best-supported resolution" } ] }

Final narrative (final_report.md)

Most credible account based on cross-checking (about 1200–3500 characters of prose in English).
An Uncertainty / disputes section.
Evidence pointers using [filename] tags.

Scoring emphasizes reasonable trust scores, thorough contradiction coverage, and a logical, evidence-backed final report.

Input Files: 5 files

in/doc1_news_site_a.md

in/doc2_news_site_b.md

in/doc3_social_media.md

in/doc4_official_statement.md

in/doc5_expert_analysis.md

Hooks

def prepare_runtime(context: dict[str, Any]) -> dict[str, Any]:
    """Prepare the workspace and return variables for prompt injection.

    Args:
        context: Runtime context; ``context["workspace"]`` is the path of the
            task workspace.

    Returns:
        A mapping with the ``TASK_DEADLINE`` and ``MIN_CONTRADICTIONS``
        hint variables (both strings).
    """
    # Optional convenience: make sure <workspace>/out exists up front.
    out_dir = Path(context["workspace"]).joinpath("out")
    out_dir.mkdir(parents=True, exist_ok=True)

    prompt_vars: dict[str, Any] = {}
    prompt_vars["TASK_DEADLINE"] = "2 hours"
    prompt_vars["MIN_CONTRADICTIONS"] = "3"
    return prompt_vars


def after_round(
    context: dict[str, Any],
    runtime_state: dict[str, Any],
    adapter_result: Any,
) -> dict[str, Any]:
    """Return ``runtime_state`` unchanged.

    This is a single-round task, so nothing needs to happen between rounds;
    ``context`` and ``adapter_result`` are accepted but not used.
    """
    return runtime_state


def cleanup_runtime(context: dict[str, Any], runtime_state: dict[str, Any]) -> None:
    """Do nothing; this task has no runtime resources to release."""
    return None

LLM Rubric

# Prompt template for the LLM judge that scores a run on three process
# dimensions: tool_use_appropriate, consistency and robustness.
# It carries two placeholders, {task_name} and {payload}. The doubled braces in
# the JSON example are presumably escapes for a str.format() call so they render
# as literal braces -- TODO confirm against the call site.
# The _dr.RUBRIC_* fragments are defined outside this view.
USER_TEMPLATE = (
    "Task name: {task_name}\n\n"
    # Shared rubric preamble fragments (contents not visible here).
    + _dr.RUBRIC_IGNORE_BOOTSTRAP_READS
    + "\n\n"
    + _dr.RUBRIC_USER_OUTCOME_NOTICE
    + "\n\n"
    # Task-specific framing and the three dimensions the judge must score.
    "Task type: synthesize multiple sources, judge credibility, detect contradictions, produce structured reports.\n\n"
    "Score these three dimensions (each 0.0–1.0); keys must match exactly:\n\n"
    "- **tool_use_appropriate**: Suitable tools to read/search documents and write outputs; matches multi-source comparison.\n\n"
    "- **consistency**: Explore before synthesizing; clear reasoning chain from sources to conclusions; avoid chaotic jumps or useless repeats.\n\n"
    "- **robustness**: High score if no failures; if reads/tools fail, recover, reroute, or state limits.\n\n"
    + "\n\n"
    + _dr.RUBRIC_USER_SECURITY_GATE_DETAIL
    + "\n\n"
    + _dr.RUBRIC_USER_JSON_NO_QUALITY
    + "\n\n"
    # Required response shape: a single JSON object, no markdown fences.
    + "Return ONLY JSON (no markdown fences):\n"
    '{{"scores": {{"tool_use_appropriate": 0.0, "consistency": 0.0, "robustness": 0.0}}, '
    '"security_gate": 1, "notes": "one line"}}\n'
    "Optional **total**: Harness may recompute the mean.\n\n"
    # Everything after this marker is the trace substituted for {payload}.
    "--- PROXY TRACE JSON BELOW ---\n"
    "{payload}"
)

Completion Grader

def _to_native(obj: Any) -> Any:
    """递归转换 numpy/pandas 类型为原生 Python 类型"""
    if hasattr(obj, "item"):
        return obj.item()
    if isinstance(obj, dict):
        return {k: _to_native(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [_to_native(v) for v in obj]
    if isinstance(obj, (bool, int, float, str)):
        return obj
    return str(obj)  # fallback


def score_workspace(workspace: Path) -> dict[str, Any]:
    """Score the files under ``<workspace>/out`` against the task ground truth.

    Three artefacts are graded and combined into a weighted total:

    * ``trustworthiness.json`` (25%): one minus the mean of
      ``abs(actual - expected) / 10`` over the documents listed in the ground
      truth's ``expected_trust_scores``, clamped to ``[0, 1]``.
    * ``contradictions.json`` (35%): fraction of the ground truth's
      ``key_contradictions`` whose ``claim`` text occurs (case-insensitive
      substring) inside the ``claim`` of some detected contradiction.
    * ``final_report.md`` (40%): fraction of ``required_elements_in_report``
      found in the report (case-insensitive substring), multiplied by 0.7
      when the report is shorter than 1200 characters.

    The ground truth is read from ``ground_truth.json`` two directory levels
    above the resolved workspace; if that file does not exist an empty ground
    truth is used.

    Args:
        workspace: Path of the task workspace containing the ``out`` directory.

    Returns:
        A JSON-serialisable dict with keys ``task``, ``workspace``,
        ``outcome_score``, ``level``, ``checks`` and ``summary``.

    Raises:
        Exception: Errors while reading or parsing ``ground_truth.json`` and
            errors caused by malformed ground-truth content in the report
            and threshold steps are not caught here.
    """
    w = workspace.resolve()
    out_dir = w / "out"
    task_dir = w.parent.parent
    gt_path = task_dir / "ground_truth.json"
    gt = json.loads(gt_path.read_text(encoding="utf-8")) if gt_path.exists() else {}

    checks = []
    trust_score = 0.0
    contra_score = 0.0
    report_score = 0.0

    # ---------- 1. Trust assessment (25%) ----------
    trust_file = out_dir / "trustworthiness.json"
    if trust_file.exists():
        try:
            data = json.loads(trust_file.read_text(encoding="utf-8"))
            expected = gt.get("expected_trust_scores", {})
            # A document missing from the submission counts as score 0.
            diff_sum = 0.0
            for doc, exp in expected.items():
                actual = data.get(doc, {}).get("score", 0)
                diff_sum += abs(actual - exp) / 10.0
            # NOTE(review): an empty expectation set yields 0.0 here, whereas the
            # other two sections yield 1.0 for an empty ground truth -- confirm
            # this asymmetry is intended.
            accuracy = 1.0 - (diff_sum / len(expected)) if expected else 0.0
            trust_score = max(0.0, min(1.0, accuracy))
            checks.append({
                "id": "trust_assessment",
                "label": "可信度评估准确度",
                "pass": bool(trust_score >= 0.7),
                "weight": 0.25,
                "detail": {"accuracy": round(float(trust_score), 4)}
            })
        except Exception as e:
            # Any malformed submission is recorded as a failed check.
            checks.append({"id": "trust_error", "label": str(e), "pass": False, "weight": 0.25, "detail": None})
    else:
        checks.append({"id": "trust_missing", "label": "trustworthiness.json missing", "pass": False, "weight": 0.25, "detail": None})

    # ---------- 2. Contradiction detection (35%) ----------
    contra_file = out_dir / "contradictions.json"
    if contra_file.exists():
        try:
            data = json.loads(contra_file.read_text(encoding="utf-8"))
            key_contradictions = gt.get("key_contradictions", [])
            detected = data.get("contradictions", [])
            # Each key contradiction is counted at most once.
            covered = 0
            for kc in key_contradictions:
                for d in detected:
                    if kc["claim"].lower() in d.get("claim", "").lower():
                        covered += 1
                        break
            contra_score = covered / len(key_contradictions) if key_contradictions else 1.0
            checks.append({
                "id": "contradiction_detection",
                "label": f"矛盾点检出率 {covered}/{len(key_contradictions)}",
                "pass": bool(contra_score >= 0.6),
                "weight": 0.35,
                "detail": {"coverage": round(float(contra_score), 4)}
            })
        except Exception as e:
            checks.append({"id": "contra_error", "label": str(e), "pass": False, "weight": 0.35, "detail": None})
    else:
        checks.append({"id": "contra_missing", "label": "contradictions.json missing", "pass": False, "weight": 0.35, "detail": None})

    # ---------- 3. Final report quality (40%) ----------
    report_file = out_dir / "final_report.md"
    if report_file.exists():
        try:
            content = report_file.read_text(encoding="utf-8")
        except (OSError, UnicodeError) as e:
            # An unreadable or undecodable report is a failed check, like the
            # other two sections, rather than an abort of the whole scorer.
            checks.append({"id": "report_error", "label": str(e), "pass": False, "weight": 0.40, "detail": None})
        else:
            required = gt.get("required_elements_in_report", [])
            # Lower-case the report once instead of once per required element.
            content_lower = content.lower()
            found = sum(1 for elem in required if elem.lower() in content_lower)
            report_score = found / len(required) if required else 1.0
            # Penalize very short reports (English task: ~1200+ chars expected)
            if len(content) < 1200:
                report_score *= 0.7
            checks.append({
                "id": "report_quality",
                "label": f"报告元素覆盖 {found}/{len(required)}",
                "pass": bool(report_score >= 0.7),
                "weight": 0.40,
                "detail": {"coverage": round(float(report_score), 4), "length": len(content)}
            })
    else:
        checks.append({"id": "report_missing", "label": "final_report.md missing", "pass": False, "weight": 0.40, "detail": None})

    total_score = trust_score * 0.25 + contra_score * 0.35 + report_score * 0.40
    # Merge ground-truth thresholds over the defaults so that a partial
    # override does not raise KeyError for the levels it leaves out.
    default_thresholds = {"excellent": 0.90, "good": 0.75, "pass": 0.60}
    thresholds = {**default_thresholds, **gt.get("scoring", {}).get("thresholds", {})}
    if total_score >= thresholds["excellent"]:
        level = "excellent"
    elif total_score >= thresholds["good"]:
        level = "good"
    elif total_score >= thresholds["pass"]:
        level = "pass"
    else:
        level = "fail"

    result = {
        "task": "012-doc-synthesis",
        "workspace": str(w),
        "outcome_score": round(float(total_score), 4),
        "level": level,
        "checks": _to_native(checks),
        "summary": {
            "trust_accuracy": round(float(trust_score), 4),
            "contradiction_coverage": round(float(contra_score), 4),
            "report_coverage": round(float(report_score), 4)
        }
    }
    # Final safety net: a JSON round-trip stringifies anything still not serialisable.
    return json.loads(json.dumps(result, default=str))


# Safe variant (catches every exception)
def score_workspace_safe(workspace: Path) -> dict[str, Any]:
    """Run ``score_workspace`` and turn any exception into an error result.

    Args:
        workspace: Path of the task workspace to score.

    Returns:
        The dict produced by ``score_workspace``; if that call raises, a
        result with ``outcome_score`` 0.0, ``level`` ``"error"``, the
        exception text under ``error`` and empty ``checks``/``summary``.
    """
    try:
        outcome = score_workspace(workspace)
    except Exception as exc:
        outcome = {
            "task": "012-doc-synthesis",
            "workspace": str(workspace),
            "outcome_score": 0.0,
            "level": "error",
            "error": str(exc),
            "checks": [],
            "summary": {},
        }
    return outcome

Task 2 →