// EvidenceRecord — the unified per-execution-trace record that the
// Evidence View emits and the Success Scorer reads.
//
// Derived from now.md spec + reconciliation of two existing prototypes:
//   - distilled_facts.jsonl / distilled_procedures.jsonl (LLM-extracted
//     text with run_id + sig_hash + extractor + verifier + embedding)
//   - contract_analyses.jsonl (observer integration + retrieval
//     telemetry + cost + duration)
//
// Required fields are the ones every record MUST have for traceability:
// run_id, task_id, timestamp, schema_version, provenance. Everything
// else is typed-but-optional because no single source has all of them
// — the Evidence View materializes them by JOINing across streams when
// the source data is present.
//
// schema_version starts at 1 and gets bumped on breaking changes.
// Validators MUST check schema_version and refuse unknown values so a
// future v2 reader doesn't silently accept v1 records (or vice versa).

import {
  ValidationResult,
  Provenance,
  requireString,
  requireNumber,
  requireIsoTimestamp,
  requireProvenance,
  requireStringArray,
} from "./types";

/** Schema version written into, and required of, every EvidenceRecord. */
export const EVIDENCE_SCHEMA_VERSION = 1;

/** The part a model played in producing a record. */
export type ModelRole =
  | "executor"     // produced the answer (e.g. scrum reviewer, mode runner LLM call)
  | "reviewer"     // judged an executor output (e.g. observer, hand-review)
  | "extractor"    // pulled structured data from text (e.g. fact_extractor)
  | "verifier"     // confirmed/rejected an extracted claim (verifier in distilled_*)
  | "categorizer"  // assigned a category (categorizer in distilled_*)
  | "tiebreaker"   // resolved a consensus split
  | "applier"      // landed code (scrum_applier)
  | "embedder"     // produced embeddings
  | "other";

// Runtime mirror of ModelRole, used by the validator. Keep in sync with
// the union above.
const MODEL_ROLES: readonly ModelRole[] = [
  "executor",
  "reviewer",
  "extractor",
  "verifier",
  "categorizer",
  "tiebreaker",
  "applier",
  "embedder",
  "other",
];

// Lowercase hex encoding of a 256-bit digest (64 characters).
const SHA256_HEX = /^[0-9a-f]{64}$/;

// Optional fields that must be string arrays when present, in the order
// their errors are reported.
const OPTIONAL_STRING_ARRAY_FIELDS = [
  "source_files",
  "commands_run",
  "success_markers",
  "failure_markers",
  "observer_notes",
] as const;

// Allowed values of human_override.decision.
const HUMAN_OVERRIDE_DECISIONS: readonly string[] = ["accept", "reject", "needs_review"];

/**
 * One execution trace, as emitted by the Evidence View.
 *
 * Only the identity fields, `schema_version` and `provenance` are
 * required; every other field is optional and filled in when the source
 * stream carries it.
 */
export interface EvidenceRecord {
  // ── Identity ──
  // run_id ties this record to a specific execution. Sources use it
  // inconsistently (some stream-level, some per-call). The Evidence
  // View canonicalizes to per-call; if the source is stream-level,
  // synthesize as `${stream_run_id}:${row_index}`.
  run_id: string;

  // task_id groups records by logical task (e.g. one PR = one task_id
  // across multiple per-call runs). Defaults to run_id when no group
  // exists — never null.
  task_id: string;

  // ISO 8601 of when the EXECUTION happened, not when this record was
  // materialized. Use the source row's timestamp; provenance carries
  // the materialization time separately.
  timestamp: string;

  schema_version: number;

  // ── Provenance ── (required — no record without source linkage)
  provenance: Provenance;

  // ── Model attribution (optional) ──
  model_name?: string;      // e.g. "kimi-k2:1t", "gpt-oss:120b"
  model_provider?: string;  // e.g. "ollama_cloud", "openrouter", "ollama"
  model_role?: ModelRole;

  // ── Content hashes (optional) ──
  // sha256 of the full input prompt and full output content. Pre-
  // computed so the Evidence Index can dedup across re-runs of the
  // same prompt without re-hashing.
  input_hash?: string;
  output_hash?: string;

  // ── Repo + execution context ──
  source_files?: string[];   // files the run touched/read
  commands_run?: string[];   // shell commands or tool calls fired
  retrieved_context?: {      // what the model saw via retrieval
    matrix_corpora?: string[];
    matrix_hits?: number;
    matrix_chunks_kept?: number;
    matrix_chunks_dropped?: number;
    pathway_fingerprints_seen?: number;
  };

  // ── Observer + scratchpad ──
  observer_notes?: string[];   // observer.review() free-form notes
  // The literals document the known verdicts; because the union also
  // includes `string`, the compiler accepts any string here.
  observer_verdict?: "accept" | "reject" | "cycle" | string;
  observer_confidence?: number; // 0-100
  scratchpad_summary?: string;  // tree-split scratchpad text or hash ref

  // ── Outcome markers ──
  // Both arrays exist because a run can have multiple succeeded gates
  // AND multiple failed gates simultaneously. Empty arrays are valid;
  // missing arrays are also valid (means "no evidence either way").
  success_markers?: string[];  // e.g. "cargo_green", "tests_passed", "anchor_grounded"
  failure_markers?: string[];  // e.g. "warning_count_up", "rationale_mismatch", "consensus_split"

  // ── Validation telemetry ──
  validation_results?: {
    grounded_fraction?: number;  // mode_compare grounding %
    schema_valid?: boolean;
    pathway_replay_succeeded?: boolean;
    [key: string]: unknown;
  };

  // ── Human-in-loop ──
  human_override?: {
    overrider: string;      // user identifier
    decision: "accept" | "reject" | "needs_review";
    reason: string;
    overridden_at: string;  // ISO 8601
  } | null;

  // ── Performance ──
  cost_usd?: number;
  latency_ms?: number;
  prompt_tokens?: number;
  completion_tokens?: number;

  // ── Free-form text content (the actual run output) ──
  // Optional because some sources are pure metadata (auto_apply.jsonl)
  // and have no text payload. Present for distilled_*, contract_analyses,
  // mode_experiments, scrum_reviews etc.
  text?: string;

  // ── Domain-specific metadata bucket ──
  // Source-specific fields that don't earn a top-level slot. e.g.
  // contract_analyses rows carry `contractor` here; mode_experiments
  // could carry `corpus_set`. Typed scalar values only — keep this
  // small or it becomes a junk drawer. Added 2026-04-27 (Kimi audit
  // flagged `(ev as any).contractor` schema bypass at export_sft.ts:126).
  // NOTE(review): the value type is reconstructed from the "scalar values
  // only" rule above — confirm whether `null` is meant to be allowed.
  metadata?: Record<string, string | number | boolean | null>;
}

/**
 * Checks that an arbitrary value is a well-formed {@link EvidenceRecord}.
 *
 * Required fields (`run_id`, `task_id`, `timestamp`, `provenance`,
 * `schema_version`) are always checked. Of the optional fields, only
 * `model_role`, the two content hashes, the string-array fields,
 * `observer_confidence` and `human_override` are checked, and only when
 * present; the remaining optional fields are accepted as-is.
 *
 * All checks run even after the first failure, so `errors` lists every
 * problem found rather than just the first.
 *
 * @param input - Untrusted value, e.g. one parsed JSONL row.
 * @returns `{ valid: true, value }` with the same object typed as an
 *   EvidenceRecord, or `{ valid: false, errors }` with one message per
 *   failed check.
 */
export function validateEvidenceRecord(input: unknown): ValidationResult<EvidenceRecord> {
  const errors: string[] = [];

  if (typeof input !== "object" || input === null || Array.isArray(input)) {
    const got = input === null ? "null" : Array.isArray(input) ? "array" : typeof input;
    return { valid: false, errors: ["expected object, got " + got] };
  }
  const r = input as Record<string, unknown>;

  // Required. The helper call sits on the left of `&&` so it always runs
  // and a previous failure never short-circuits a later check.
  // NOTE(review): the require* helpers come from ./types; they are assumed
  // to append a message to `errors` and return false on failure — confirm.
  let ok = true;
  ok = requireString(r.run_id, "run_id", errors) && ok;
  ok = requireString(r.task_id, "task_id", errors) && ok;
  ok = requireIsoTimestamp(r.timestamp, "timestamp", errors) && ok;
  ok = requireProvenance(r.provenance, "provenance", errors) && ok;
  if (r.schema_version !== EVIDENCE_SCHEMA_VERSION) {
    errors.push(`schema_version: expected ${EVIDENCE_SCHEMA_VERSION}, got ${JSON.stringify(r.schema_version)}`);
    ok = false;
  }

  // Optional but typed-when-present
  if (r.model_role !== undefined && !MODEL_ROLES.includes(r.model_role as ModelRole)) {
    errors.push(`model_role: must be one of ${MODEL_ROLES.join("|")}, got ${JSON.stringify(r.model_role)}`);
    ok = false;
  }

  // The typeof guard matters: testing String(value) would coerce and
  // accept non-strings such as a one-element array holding a valid digest.
  if (r.input_hash !== undefined && !(typeof r.input_hash === "string" && SHA256_HEX.test(r.input_hash))) {
    errors.push("input_hash: must be hex sha256 when present");
    ok = false;
  }
  if (r.output_hash !== undefined && !(typeof r.output_hash === "string" && SHA256_HEX.test(r.output_hash))) {
    errors.push("output_hash: must be hex sha256 when present");
    ok = false;
  }

  for (const field of OPTIONAL_STRING_ARRAY_FIELDS) {
    if (r[field] !== undefined && !requireStringArray(r[field], field, errors)) ok = false;
  }

  if (r.observer_confidence !== undefined) {
    if (!requireNumber(r.observer_confidence, "observer_confidence", errors)) {
      ok = false;
    } else {
      const confidence = r.observer_confidence as number;
      // Negated inclusion test so that NaN is rejected as out of range too.
      if (!(confidence >= 0 && confidence <= 100)) {
        errors.push("observer_confidence: must be in [0, 100]");
        ok = false;
      }
    }
  }

  // null is an allowed value for human_override and is skipped here.
  if (r.human_override !== undefined && r.human_override !== null) {
    if (typeof r.human_override !== "object" || Array.isArray(r.human_override)) {
      errors.push("human_override: expected object or null");
      ok = false;
    } else {
      const ho = r.human_override as Record<string, unknown>;
      ok = requireString(ho.overrider, "human_override.overrider", errors) && ok;
      ok = requireString(ho.reason, "human_override.reason", errors) && ok;
      ok = requireIsoTimestamp(ho.overridden_at, "human_override.overridden_at", errors) && ok;
      if (typeof ho.decision !== "string" || !HUMAN_OVERRIDE_DECISIONS.includes(ho.decision)) {
        errors.push(`human_override.decision: must be accept|reject|needs_review`);
        ok = false;
      }
    }
  }

  if (!ok) return { valid: false, errors };
  // Same object, now typed: the checks above are the runtime evidence for
  // this assertion.
  return { valid: true, value: input as EvidenceRecord };
}