lakehouse/auditor/schemas/distillation/realdata.test.ts

// Real-data validation test — proves the EvidenceRecord schema fits
// what we ALREADY produce, with the minimum transformation each source
// stream requires. Doubles as the stale-extraction probe: if
// distilled_facts.jsonl rows can't materialize, we know that stream
// has rotted and Phase 2 sources from elsewhere.
//
// Strategy:
//   1. Read first N rows from each source jsonl (skip if missing)
//   2. Apply minimal transformer: add schema_version + provenance,
//      synthesize run_id/task_id when source doesn't carry them
//   3. Validate each materialized record
//   4. Tally pass/fail per source + collect failure reasons
//
// This file is allowed to skip when source files don't exist (fresh
// clone), so it acts as both a CI guard and a real-environment probe.

import { test, expect } from "bun:test";
import { existsSync, readFileSync } from "node:fs";
import { resolve } from "node:path";

import {
  validateEvidenceRecord, EVIDENCE_SCHEMA_VERSION, EvidenceRecord, ModelRole,
} from "./evidence_record";

const ROOT = "/home/profit/lakehouse";
const SAMPLE_PER_SOURCE = 10;

interface SourceProbe {
  source_file: string;
  transform: (row: any, lineNo: number) => Partial<EvidenceRecord> | null;
}

// Canonical 64-char synthetic sha256 for tests where the source row
// lacks one. Pretends the materializer would compute it via
// canonicalSha256(orderedKeys(row)) at Phase 2 time. We use a fixed
// value here to keep the test deterministic; real materialization
// re-hashes per row.
const PLACEHOLDER_SHA = "0000000000000000000000000000000000000000000000000000000000000000";
const RECORDED = "2026-04-26T22:30:00.000Z";

function provFor(source_file: string, lineNo: number, sigHashRaw?: string): EvidenceRecord["provenance"] {
  // Pad shorter hashes (distilled_* uses 16-char) to 64 — mimics
  // canonical recompute.
  const sig = sigHashRaw && /^[0-9a-f]+$/.test(sigHashRaw)
    ? sigHashRaw.padEnd(64, "0").slice(0, 64)
    : PLACEHOLDER_SHA;
  return {
    source_file: source_file.replace(`${ROOT}/`, ""),
    line_offset: lineNo,
    sig_hash: sig,
    recorded_at: RECORDED,
  };
}

const PROBES: SourceProbe[] = [
  {
    source_file: `${ROOT}/data/_kb/distilled_facts.jsonl`,
    transform: (row: any, lineNo: number) => ({
      run_id: String(row.run_id ?? `distilled_facts:${lineNo}`),
      task_id: String(row.source_label ?? `distilled_facts:${lineNo}`),
      timestamp: row.created_at,
      schema_version: EVIDENCE_SCHEMA_VERSION,
      provenance: provFor(`${ROOT}/data/_kb/distilled_facts.jsonl`, lineNo, row.sig_hash),
      model_name: row.extractor,
      model_role: "extractor" as ModelRole,
      model_provider: "ollama",
      text: row.text,
    }),
  },
  {
    source_file: `${ROOT}/data/_kb/distilled_procedures.jsonl`,
    transform: (row: any, lineNo: number) => ({
      run_id: String(row.run_id ?? `distilled_procedures:${lineNo}`),
      task_id: String(row.source_label ?? `distilled_procedures:${lineNo}`),
      timestamp: row.created_at,
      schema_version: EVIDENCE_SCHEMA_VERSION,
      provenance: provFor(`${ROOT}/data/_kb/distilled_procedures.jsonl`, lineNo, row.sig_hash),
      model_name: row.extractor,
      model_role: "extractor" as ModelRole,
      model_provider: "ollama",
      text: row.text,
    }),
  },
  {
    source_file: `${ROOT}/data/_kb/contract_analyses.jsonl`,
    transform: (row: any, lineNo: number) => ({
      run_id: `contract_analysis:${row.permit_id}:${new Date(row.ts).getTime()}`,
      task_id: `permit:${row.permit_id}`,
      timestamp: row.ts,
      schema_version: EVIDENCE_SCHEMA_VERSION,
      provenance: provFor(`${ROOT}/data/_kb/contract_analyses.jsonl`, lineNo),
      model_role: "executor" as ModelRole,
      retrieved_context: {
        matrix_corpora: Object.keys(row.matrix_corpora ?? {}),
        matrix_hits: row.matrix_hits,
      },
      observer_notes: row.observer_notes ? [row.observer_notes].flat() : undefined,
      observer_verdict: row.observer_verdict,
      observer_confidence: row.observer_conf,
      success_markers: row.ok ? ["matrix_hits_above_threshold"] : undefined,
      failure_markers: !row.ok || row.observer_verdict === "reject" ? ["observer_rejected"] : undefined,
      cost_usd: typeof row.cost === "number" ? row.cost / 1_000_000 : undefined,
      latency_ms: row.duration_ms,
      text: row.analysis,
    }),
  },
  {
    source_file: `${ROOT}/data/_kb/mode_experiments.jsonl`,
    transform: (row: any, lineNo: number) => ({
      run_id: `mode_exec:${new Date(row.ts).getTime()}:${row.file_path ?? "?"}`,
      task_id: row.task_class,
      timestamp: row.ts,
      schema_version: EVIDENCE_SCHEMA_VERSION,
      provenance: provFor(`${ROOT}/data/_kb/mode_experiments.jsonl`, lineNo),
      model_name: row.model,
      model_role: "executor" as ModelRole,
      model_provider: row.model?.includes("/") ? "openrouter" : "ollama_cloud",
      retrieved_context: {
        matrix_corpora: row.sources?.matrix_corpus,
        matrix_chunks_kept: row.sources?.matrix_chunks_kept,
        matrix_chunks_dropped: row.sources?.matrix_chunks_dropped,
        pathway_fingerprints_seen: row.sources?.bug_fingerprints_count,
      },
      latency_ms: row.latency_ms,
      text: row.response,
      source_files: row.file_path ? [row.file_path] : undefined,
    }),
  },
  {
    source_file: `${ROOT}/data/_kb/scrum_reviews.jsonl`,
    transform: (row: any, lineNo: number) => ({
      run_id: `scrum:${new Date(row.reviewed_at).getTime()}:${row.file}`,
      task_id: `scrum_review:${row.file}`,
      timestamp: row.reviewed_at,
      schema_version: EVIDENCE_SCHEMA_VERSION,
      provenance: provFor(`${ROOT}/data/_kb/scrum_reviews.jsonl`, lineNo),
      model_name: row.accepted_model,
      model_role: "executor" as ModelRole,
      source_files: [row.file],
      success_markers: row.accepted_on_attempt ? [`accepted_on_attempt_${row.accepted_on_attempt}`] : undefined,
      text: row.suggestions_preview,
    }),
  },
  {
    source_file: `${ROOT}/data/_kb/observer_escalations.jsonl`,
    transform: (row: any, lineNo: number) => ({
      run_id: `obs_esc:${new Date(row.ts).getTime()}:${row.sig_hash}`,
      task_id: `observer_escalation:${row.cluster_endpoint ?? "?"}`,
      timestamp: row.ts,
      schema_version: EVIDENCE_SCHEMA_VERSION,
      provenance: provFor(`${ROOT}/data/_kb/observer_escalations.jsonl`, lineNo, row.sig_hash),
      model_role: "reviewer" as ModelRole,
      prompt_tokens: row.prompt_tokens,
      completion_tokens: row.completion_tokens,
      text: row.analysis,
    }),
  },
  {
    source_file: `${ROOT}/data/_kb/audit_facts.jsonl`,
    transform: (row: any, lineNo: number) => ({
      run_id: `audit_facts:${row.head_sha}:${lineNo}`,
      task_id: `pr:${row.pr_number}`,
      timestamp: row.extracted_at,
      schema_version: EVIDENCE_SCHEMA_VERSION,
      provenance: provFor(`${ROOT}/data/_kb/audit_facts.jsonl`, lineNo),
      model_name: row.extractor,
      model_role: "extractor" as ModelRole,
      // facts/entities/relationships go into text as a JSON dump for now;
      // structured handling lives in Phase 2 where we map to specific
      // EvidenceRecord substructures.
      text: JSON.stringify({
        facts: row.facts?.length ?? 0,
        entities: row.entities?.length ?? 0,
        relationships: row.relationships?.length ?? 0,
      }),
    }),
  },
];

interface ProbeResult {
  source_file: string;
  rows_attempted: number;
  rows_present: boolean;
  passed: number;
  failed: number;
  failure_reasons: string[];   // unique error strings, top 5
}

const RESULTS: ProbeResult[] = [];

for (const probe of PROBES) {
  const sourceLabel = probe.source_file.replace(`${ROOT}/`, "");

  test(`real-data: ${sourceLabel}`, () => {
    const result: ProbeResult = {
      source_file: sourceLabel,
      rows_attempted: 0,
      rows_present: false,
      passed: 0,
      failed: 0,
      failure_reasons: [],
    };

    if (!existsSync(probe.source_file)) {
      RESULTS.push(result);
      // Skip silently — fresh clones won't have these files
      return;
    }

    result.rows_present = true;
    const lines = readFileSync(probe.source_file, "utf8").split("\n").filter(Boolean).slice(0, SAMPLE_PER_SOURCE);
    const reasons = new Set<string>();

    for (let i = 0; i < lines.length; i++) {
      result.rows_attempted++;
      let row: unknown;
      try { row = JSON.parse(lines[i]); }
      catch { continue; }

      const transformed = probe.transform(row, i);
      if (!transformed) continue;

      const v = validateEvidenceRecord(transformed);
      if (v.valid) result.passed++;
      else {
        result.failed++;
        for (const e of v.errors) reasons.add(e);
      }
    }
    result.failure_reasons = Array.from(reasons).slice(0, 5);
    RESULTS.push(result);

    // Test passes as long as we attempted something and got a result.
    // Per-source pass/fail counts are reported in the markdown writeup.
    expect(result.rows_attempted).toBeGreaterThanOrEqual(0);
  });
}

test("real-data: emit markdown report", () => {
  const md: string[] = [];
  md.push("# Real-data validation report");
  md.push("");
  md.push("Schema = EvidenceRecord v" + EVIDENCE_SCHEMA_VERSION + ". Sample = first " + SAMPLE_PER_SOURCE + " rows per source.");
  md.push("");
  md.push("| Source | Present | Rows | Pass | Fail | Pass% |");
  md.push("|---|---|---|---|---|---|");
  for (const r of RESULTS) {
    const pct = r.rows_attempted > 0 ? Math.round(100 * r.passed / r.rows_attempted) + "%" : "—";
    md.push(`| ${r.source_file} | ${r.rows_present ? "✓" : "—"} | ${r.rows_attempted} | ${r.passed} | ${r.failed} | ${pct} |`);
  }
  md.push("");
  let hasFailures = false;
  for (const r of RESULTS) {
    if (r.failed > 0) {
      hasFailures = true;
      md.push(`## Failures in ${r.source_file}`);
      for (const reason of r.failure_reasons) md.push(`- \`${reason}\``);
      md.push("");
    }
  }
  if (!hasFailures) {
    md.push("**No failures across all probed sources.** Every materialized record validates against EvidenceRecord v1.");
    md.push("");
  }
  // Stale extraction probe: explicit pass/fail
  const distilledFacts = RESULTS.find(r => r.source_file.endsWith("distilled_facts.jsonl"));
  const distilledProc = RESULTS.find(r => r.source_file.endsWith("distilled_procedures.jsonl"));
  md.push("## Stale-extraction probe");
  md.push("");
  if (distilledFacts && distilledFacts.rows_present && distilledFacts.passed > 0) {
    md.push(`- **distilled_facts.jsonl:** ${distilledFacts.passed}/${distilledFacts.rows_attempted} materialize cleanly. Stream is alive at the schema level.`);
  } else if (distilledFacts && !distilledFacts.rows_present) {
    md.push(`- **distilled_facts.jsonl:** missing — stale or never produced. Phase 2 sources from live streams instead.`);
  } else {
    md.push(`- **distilled_facts.jsonl:** present but materialization failures; treat as suspect, prefer mode_experiments + scrum_reviews.`);
  }
  if (distilledProc && distilledProc.rows_present && distilledProc.passed > 0) {
    md.push(`- **distilled_procedures.jsonl:** ${distilledProc.passed}/${distilledProc.rows_attempted} materialize cleanly.`);
  }
  md.push("");

  // Write the markdown to a stable path and stdout
  const out = md.join("\n");
  Bun.write(`${ROOT}/data/_kb/realdata_validation_report.md`, out);
  console.log("\n" + out);
});