// lakehouse/auditor/schemas/distillation/evidence_record.ts

// EvidenceRecord — the unified per-execution-trace record that the
// Evidence View emits and the Success Scorer reads.
//
// Derived from now.md spec + reconciliation of two existing prototypes:
//   - distilled_facts.jsonl / distilled_procedures.jsonl (LLM-extracted
//     text with run_id + sig_hash + extractor + verifier + embedding)
//   - contract_analyses.jsonl (observer integration + retrieval
//     telemetry + cost + duration)
//
// Required fields are the ones every record MUST have for traceability:
// run_id, task_id, timestamp, schema_version, provenance. Everything
// else is typed-but-optional because no single source has all of them
// — the Evidence View materializes them by JOINing across streams when
// the source data is present.
//
// schema_version starts at 1 and gets bumped on breaking changes.
// Validators MUST check schema_version and refuse unknown values so a
// future v2 reader doesn't silently accept v1 records (or vice versa).

import {
  ValidationResult, Provenance,
  requireString, requireNumber, requireIsoTimestamp, requireProvenance, requireStringArray,
} from "./types";

/**
 * Schema version that `validateEvidenceRecord` requires in a record's
 * `schema_version` field; any other value fails validation. Starts at 1 and
 * is meant to be bumped on breaking schema changes.
 */
export const EVIDENCE_SCHEMA_VERSION = 1;

/**
 * The part a model played in the execution that a record describes.
 *
 * - `executor`: produced the answer (e.g. scrum reviewer, mode runner LLM call)
 * - `reviewer`: judged an executor output (e.g. observer, hand-review)
 * - `extractor`: pulled structured data from text (e.g. fact_extractor)
 * - `verifier`: confirmed/rejected an extracted claim (verifier in distilled_*)
 * - `categorizer`: assigned a category (categorizer in distilled_*)
 * - `tiebreaker`: resolved a consensus split
 * - `applier`: landed code (scrum_applier)
 * - `embedder`: produced embeddings
 * - `other`: none of the roles above
 */
export type ModelRole =
  "executor" | "reviewer" | "extractor" | "verifier" | "categorizer"
  | "tiebreaker" | "applier" | "embedder" | "other";

/** What the model saw via retrieval. */
type RetrievedContext = {
  matrix_corpora?: string[];
  matrix_hits?: number;
  matrix_chunks_kept?: number;
  matrix_chunks_dropped?: number;
  pathway_fingerprints_seen?: number;
};

/**
 * Validation telemetry. The named keys are typed; any additional key is
 * permitted and carries an `unknown` value.
 */
type ValidationTelemetry = {
  /** mode_compare grounding % */
  grounded_fraction?: number;
  schema_valid?: boolean;
  pathway_replay_succeeded?: boolean;
  [key: string]: unknown;
};

/** Human-in-loop override entry. */
type HumanOverride = {
  /** User identifier. */
  overrider: string;
  decision: "accept" | "reject" | "needs_review";
  reason: string;
  /** ISO 8601. */
  overridden_at: string;
};

/**
 * Unified per-execution-trace record that the Evidence View emits and the
 * Success Scorer reads.
 *
 * Only `run_id`, `task_id`, `timestamp`, `schema_version` and `provenance`
 * are required; every other field is optional because no single source
 * carries all of them.
 */
export interface EvidenceRecord {
  // ───────────── Identity ─────────────

  /**
   * Ties this record to a specific execution. Sources use it inconsistently
   * (some stream-level, some per-call). The Evidence View canonicalizes to
   * per-call; if the source is stream-level, synthesize as
   * `${stream_run_id}:${row_index}`.
   */
  run_id: string;

  /**
   * Groups records by logical task (e.g. one PR = one task_id across
   * multiple per-call runs). Defaults to run_id when no group exists —
   * never null.
   */
  task_id: string;

  /**
   * ISO 8601 of when the EXECUTION happened, not when this record was
   * materialized. Use the source row's timestamp; provenance carries the
   * materialization time separately.
   */
  timestamp: string;

  schema_version: number;

  // ───────────── Provenance ─────────────

  /** Required — no record without source linkage. */
  provenance: Provenance;

  // ───────────── Model attribution (optional) ─────────────

  /** e.g. "kimi-k2:1t", "gpt-oss:120b" */
  model_name?: string;
  /** e.g. "ollama_cloud", "openrouter", "ollama" */
  model_provider?: string;
  model_role?: ModelRole;

  // ───────────── Content hashes (optional) ─────────────

  /**
   * sha256 of the full input prompt. Pre-computed (together with
   * `output_hash`) so the Evidence Index can dedup across re-runs of the
   * same prompt without re-hashing.
   */
  input_hash?: string;
  /** sha256 of the full output content. */
  output_hash?: string;

  // ───────────── Repo + execution context ─────────────

  /** Files the run touched/read. */
  source_files?: string[];
  /** Shell commands or tool calls fired. */
  commands_run?: string[];
  /** What the model saw via retrieval. */
  retrieved_context?: RetrievedContext;

  // ───────────── Observer + scratchpad ─────────────

  /** observer.review() free-form notes. */
  observer_notes?: string[];
  observer_verdict?: "accept" | "reject" | "cycle" | string;
  /** 0-100 */
  observer_confidence?: number;
  /** Tree-split scratchpad text or hash ref. */
  scratchpad_summary?: string;

  // ───────────── Outcome markers ─────────────
  // Both arrays exist because a run can have multiple succeeded gates AND
  // multiple failed gates simultaneously. Empty arrays are valid; missing
  // arrays are also valid (means "no evidence either way").

  /** e.g. "cargo_green", "tests_passed", "anchor_grounded" */
  success_markers?: string[];
  /** e.g. "warning_count_up", "rationale_mismatch", "consensus_split" */
  failure_markers?: string[];

  // ───────────── Validation telemetry ─────────────

  validation_results?: ValidationTelemetry;

  // ───────────── Human-in-loop ─────────────

  human_override?: HumanOverride | null;

  // ───────────── Performance ─────────────

  cost_usd?: number;
  latency_ms?: number;
  prompt_tokens?: number;
  completion_tokens?: number;

  // ───────────── Free-form text content (the actual run output) ─────────────

  /**
   * Optional because some sources are pure metadata (auto_apply.jsonl) and
   * have no text payload. Present for distilled_*, contract_analyses,
   * mode_experiments, scrum_reviews etc.
   */
  text?: string;

  // ───────────── Domain-specific metadata bucket ─────────────

  /**
   * Source-specific fields that don't earn a top-level slot. e.g.
   * contract_analyses rows carry `contractor` here; mode_experiments could
   * carry `corpus_set`. Typed scalar values only — keep this small or it
   * becomes a junk drawer. Added 2026-04-27 (Kimi audit flagged
   * `(ev as any).contractor` schema bypass at export_sft.ts:126).
   */
  metadata?: Record<string, string | number | boolean>;
}

/**
 * Validates an untrusted value against the EvidenceRecord schema.
 *
 * Checks, in order:
 * - required: `run_id`, `task_id`, `timestamp`, `provenance` (delegated to
 *   the `require*` helpers) and `schema_version === EVIDENCE_SCHEMA_VERSION`;
 * - optional, only when present: `model_role`, `input_hash`, `output_hash`,
 *   the string-array fields, `observer_confidence` (number in [0, 100]) and
 *   `human_override` (a non-array object, or null).
 *
 * Every check runs even after an earlier one fails, so `errors` lists all
 * problems found rather than just the first.
 *
 * NOTE(review): the remaining optional fields (`model_name`, `cost_usd`,
 * `retrieved_context`, `metadata`, …) are not inspected here, so a "valid"
 * result does not guarantee their types.
 *
 * @param input - Any value, typically a parsed JSONL row.
 * @returns `{ valid: true, value }` where `value` is the same object as
 *   `input` (not a copy), or `{ valid: false, errors }`.
 */
export function validateEvidenceRecord(input: unknown): ValidationResult<EvidenceRecord> {
  const errors: string[] = [];

  if (typeof input !== "object" || input === null) {
    return { valid: false, errors: ["expected object, got " + (input === null ? "null" : typeof input)] };
  }
  const r = input as Record<string, unknown>;

  // Required. Each helper is handed the shared `errors` array (presumably
  // appending its own message on failure); `&& ok` keeps an earlier failure
  // sticky while still running every check.
  let ok = true;
  ok = requireString(r.run_id, "run_id", errors) && ok;
  ok = requireString(r.task_id, "task_id", errors) && ok;
  ok = requireIsoTimestamp(r.timestamp, "timestamp", errors) && ok;
  ok = requireProvenance(r.provenance, "provenance", errors) && ok;

  // Strict equality: unknown versions (and non-numbers such as "1") are refused.
  if (r.schema_version !== EVIDENCE_SCHEMA_VERSION) {
    errors.push(`schema_version: expected ${EVIDENCE_SCHEMA_VERSION}, got ${JSON.stringify(r.schema_version)}`);
    ok = false;
  }

  // Optional but typed-when-present
  if (r.model_role !== undefined) {
    const valid: ModelRole[] = ["executor", "reviewer", "extractor", "verifier", "categorizer", "tiebreaker", "applier", "embedder", "other"];
    if (!valid.includes(r.model_role as ModelRole)) {
      errors.push(`model_role: must be one of ${valid.join("|")}, got ${JSON.stringify(r.model_role)}`);
      ok = false;
    }
  }

  // Hashes must be real strings. Coercing with String() first would let a
  // value like ["<64 hex chars>"] pass, because a one-element array
  // stringifies to its element.
  const sha256Hex = /^[0-9a-f]{64}$/;
  for (const field of ["input_hash", "output_hash"] as const) {
    const hash = r[field];
    if (hash !== undefined && !(typeof hash === "string" && sha256Hex.test(hash))) {
      errors.push(`${field}: must be hex sha256 when present`);
      ok = false;
    }
  }

  for (const field of ["source_files", "commands_run", "success_markers", "failure_markers", "observer_notes"] as const) {
    if (r[field] !== undefined && !requireStringArray(r[field], field, errors)) ok = false;
  }

  const confidence = r.observer_confidence;
  if (confidence !== undefined) {
    if (!requireNumber(confidence, "observer_confidence", errors)) ok = false;
    else if ((confidence as number) < 0 || (confidence as number) > 100) {
      errors.push("observer_confidence: must be in [0, 100]");
      ok = false;
    }
  }

  // null and undefined both mean "no override".
  const override = r.human_override;
  if (override !== undefined && override !== null) {
    // Arrays are typeof "object" too; reject them here with the accurate
    // message instead of a cascade of per-field errors.
    if (typeof override !== "object" || Array.isArray(override)) {
      errors.push("human_override: expected object or null");
      ok = false;
    } else {
      const ho = override as Record<string, unknown>;
      ok = requireString(ho.overrider, "human_override.overrider", errors) && ok;
      ok = requireString(ho.reason, "human_override.reason", errors) && ok;
      ok = requireIsoTimestamp(ho.overridden_at, "human_override.overridden_at", errors) && ok;
      if (!["accept", "reject", "needs_review"].includes(ho.decision as string)) {
        errors.push(`human_override.decision: must be accept|reject|needs_review`);
        ok = false;
      }
    }
  }

  if (!ok) return { valid: false, errors };
  return { valid: true, value: r as unknown as EvidenceRecord };
}