The original OPEN #2 line called for "SFT export pipeline + audit_baselines lineage." Commit 7bb432f shipped the SFT export. This commit ports the audit_baselines half — the longitudinal drift signal that distinguishes "metrics shifted because the world changed" from "metrics shifted because we broke something." Mirrors Rust scripts/distillation/audit_full.ts's substrate: - LoadLastBaseline(path) reads the most recent entry from data/_kb/audit_baselines.jsonl. Returns (nil, nil) on missing file (first run), errors on truncated last line (partial-write detection — operators don't lose drift signal silently). - AppendBaseline(path, baseline) appends one entry as a JSON line. Atomic at the line level via bufio + O_APPEND. Creates the parent directory if missing. - BuildAuditDriftTable(prior, current, threshold) computes per-metric drift. flag values mirror Rust exactly: first_run, ok, warn. DefaultDriftWarnThreshold = 0.20 = Rust's 20%. - FormatAuditDriftTable renders a fixed-width text grid for stdout dumps in audit-full runs. Edge cases handled: - Zero-baseline: prior=0 means no division — PctChange stays nil. current=0 → ok (no change). current>0 → warn (zero→nonzero is always notable, never silently fine). - New metric in current: flagged first_run, not "0%-change". Operators see "this is a new signal we haven't tracked before." - Sort: stable by metric name for deterministic JSON output and clean CI diffs. Generic on metric name (vs Rust's pinned p2_evidence_rows etc.): the Rust phase numbering doesn't translate to Go directly. The AuditBaselineRustCompat constant pins the Rust names so operators running both runtimes use the same labels, which makes drift comparison meaningful across the two pipelines. 
13 new tests covering: missing file, last-line-wins, blank-line tolerance, malformed-line errors, append round-trip, append-to-existing, schema validation, first-run, threshold boundary, zero-baseline, new-metric-in-current, sort-by-metric stability, formatter output rendering. OPEN #2's "audit_baselines lineage" half now closed. The distillation package surface is at parity with the Rust pipeline: scorer, scored runs, SFT export, audit baselines all available on the Go side. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
257 lines
8.6 KiB
Go
257 lines
8.6 KiB
Go
package distillation
|
|
|
|
// Audit-baseline lineage — the longitudinal signal that distinguishes
|
|
// "metrics shifted because the world changed" from "metrics shifted
|
|
// because we broke something." Mirrors the Rust audit_full.ts
|
|
// LoadBaseline/AppendBaseline/buildDriftTable shape so a Go-side
|
|
// audit run can be compared against Rust-side baselines and
|
|
// vice-versa during the migration.
|
|
//
|
|
// Storage: data/_kb/audit_baselines.jsonl, one AuditBaseline per
|
|
// line, append-only. The LAST line is the most recent. New runs
|
|
// read the prior baseline, compute drift vs current metrics, then
|
|
// append a fresh entry.
|
|
//
|
|
// Why generic on metric name (vs Rust's pinned p2_evidence_rows
|
|
// etc.): the Rust phase numbering (p0..p7) doesn't translate to Go
|
|
// directly. Operators with mixed Rust+Go pipelines should use the
|
|
// SAME metric names on both sides so the drift table compares
|
|
// like-for-like. Helper constants below pin the Rust-compat names
|
|
// for callers running both runtimes.
|
|
|
|
import (
	"bufio"
	"bytes"
	"encoding/json"
	"errors"
	"fmt"
	"math"
	"os"
	"path/filepath"
	"sort"
	"strings"
)
|
|
|
|
// AuditBaseline is one entry in the audit_baselines.jsonl
// longitudinal log. Schema-stable; new metrics land as new keys
// in the Metrics map (additive — readers tolerate unknown keys,
// so older binaries can still decode newer entries).
type AuditBaseline struct {
	RecordedAt string           `json:"recorded_at"`          // ISO 8601 UTC timestamp of the audit run
	GitCommit  string           `json:"git_commit,omitempty"` // sha of the run's HEAD; omitted when unknown
	Metrics    map[string]int64 `json:"metrics"`              // metric name → count for this run
}
|
|
|
|
// AuditBaselineRustCompat lists the metric names the Rust pipeline
// emits at audit_full.ts. Go-side callers running an equivalent
// audit should use these names so drift compares across runtimes.
// Adding new names here requires the Rust side to mint them too.
//
// Order is preserved as the Rust pipeline emits them (p2 → p4);
// callers may rely on it for display, so do not reorder.
var AuditBaselineRustCompat = []string{
	"p2_evidence_rows",
	"p2_evidence_skips",
	"p3_accepted",
	"p3_partial",
	"p3_rejected",
	"p3_human",
	"p4_rag_rows",
	"p4_sft_rows",
	"p4_pref_pairs",
	"p4_total_quarantined",
}
|
|
|
|
// DefaultBaselinePath returns the canonical audit baselines path
|
|
// rooted at the lakehouse data dir. Match Rust's BASELINE_PATH_FOR.
|
|
func DefaultBaselinePath(root string) string {
|
|
return filepath.Join(root, "data", "_kb", "audit_baselines.jsonl")
|
|
}
|
|
|
|
// LoadLastBaseline reads audit_baselines.jsonl and returns the
|
|
// most recent entry — i.e. the LAST non-empty JSON line. Missing
|
|
// file or empty file returns (nil, nil), not an error: a fresh
|
|
// pipeline has no baseline yet, and the caller should treat that
|
|
// as "first run" via BuildAuditDriftTable's nil-prior handling.
|
|
//
|
|
// Malformed last line returns an error (rather than silently
|
|
// skipping to the previous line) so operators don't lose drift
|
|
// signal under partial-write corruption.
|
|
func LoadLastBaseline(path string) (*AuditBaseline, error) {
|
|
data, err := os.ReadFile(path)
|
|
if os.IsNotExist(err) {
|
|
return nil, nil
|
|
}
|
|
if err != nil {
|
|
return nil, fmt.Errorf("read baselines: %w", err)
|
|
}
|
|
lines := strings.Split(string(data), "\n")
|
|
// Walk back to the last non-empty line.
|
|
for i := len(lines) - 1; i >= 0; i-- {
|
|
s := strings.TrimSpace(lines[i])
|
|
if s == "" {
|
|
continue
|
|
}
|
|
var b AuditBaseline
|
|
if err := json.Unmarshal([]byte(s), &b); err != nil {
|
|
return nil, fmt.Errorf("decode last baseline (line %d): %w", i+1, err)
|
|
}
|
|
return &b, nil
|
|
}
|
|
return nil, nil
|
|
}
|
|
|
|
// AppendBaseline appends one AuditBaseline as a JSON line to
|
|
// audit_baselines.jsonl. Creates the parent directory if missing.
|
|
// Atomic write at the line level: a partial write on disk-full or
|
|
// crash leaves the file with at most one truncated trailing line,
|
|
// which LoadLastBaseline will surface as a decode error.
|
|
func AppendBaseline(path string, b AuditBaseline) error {
|
|
if b.RecordedAt == "" {
|
|
return errors.New("audit_baseline: RecordedAt required")
|
|
}
|
|
if b.Metrics == nil {
|
|
return errors.New("audit_baseline: Metrics required (use empty map for zero-metric run)")
|
|
}
|
|
if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil {
|
|
return fmt.Errorf("mkdir baseline dir: %w", err)
|
|
}
|
|
line, err := json.Marshal(b)
|
|
if err != nil {
|
|
return fmt.Errorf("encode baseline: %w", err)
|
|
}
|
|
f, err := os.OpenFile(path, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0o644)
|
|
if err != nil {
|
|
return fmt.Errorf("open baselines: %w", err)
|
|
}
|
|
defer f.Close()
|
|
w := bufio.NewWriter(f)
|
|
if _, err := w.Write(line); err != nil {
|
|
return fmt.Errorf("write baseline: %w", err)
|
|
}
|
|
if err := w.WriteByte('\n'); err != nil {
|
|
return fmt.Errorf("write newline: %w", err)
|
|
}
|
|
return w.Flush()
|
|
}
|
|
|
|
// AuditDriftFlag categorizes a single metric's drift verdict.
// Mirrors the Rust DriftRow.flag values exactly — the string
// values are part of the cross-runtime JSON contract; do not
// rename without changing the Rust side in lockstep.
type AuditDriftFlag string

const (
	AuditDriftFlagFirstRun AuditDriftFlag = "first_run" // no prior baseline → can't compute change
	AuditDriftFlagOK       AuditDriftFlag = "ok"        // |Δ%| ≤ threshold (or zero→zero, no change)
	AuditDriftFlagWarn     AuditDriftFlag = "warn"      // |Δ%| > threshold (or zero→nonzero, always notable)
)
|
|
|
|
// DefaultDriftWarnThreshold is 20% — matches Rust's hard-coded
// `Math.abs(pct) > 0.20`. Operators tuning sensitivity per metric
// can pass a different value to BuildAuditDriftTable (values ≤ 0
// there fall back to this default).
const DefaultDriftWarnThreshold = 0.20
|
|
|
|
// AuditDriftRow is one metric's drift verdict. PctChange is nil
// when prior baseline was zero (division-by-zero) OR when this is
// the first run. Encoded as *float64 so JSON emits null
// rather than 0.0 for "unknowable" cases; Baseline is likewise
// nil (JSON null) when no prior value exists for the metric.
type AuditDriftRow struct {
	Metric    string         `json:"metric"`     // metric name (key in AuditBaseline.Metrics)
	Baseline  *int64         `json:"baseline"`   // prior value; nil on first run / new metric
	Current   int64          `json:"current"`    // value from the current snapshot (0 if absent)
	PctChange *float64       `json:"pct_change"` // fractional change (0.2 = +20%); nil when unknowable
	Flag      AuditDriftFlag `json:"flag"`       // first_run | ok | warn
}
|
|
|
|
// BuildAuditDriftTable computes per-metric drift between a prior
|
|
// baseline (nil = first run) and the current metric snapshot. The
|
|
// result is sorted by metric name for stable display.
|
|
//
|
|
// Threshold is the absolute percent-change above which a metric is
|
|
// flagged "warn". Pass DefaultDriftWarnThreshold (0.20 = 20%) to
|
|
// match Rust audit_full.ts. Use a per-metric threshold map by
|
|
// calling BuildAuditDriftTable once per metric subset.
|
|
func BuildAuditDriftTable(prior *AuditBaseline, current map[string]int64, threshold float64) []AuditDriftRow {
|
|
if threshold <= 0 {
|
|
threshold = DefaultDriftWarnThreshold
|
|
}
|
|
// Union of metric names so a metric that disappeared from
|
|
// current still surfaces as "current=0, drifted -100%".
|
|
names := make(map[string]struct{}, len(current))
|
|
for k := range current {
|
|
names[k] = struct{}{}
|
|
}
|
|
if prior != nil {
|
|
for k := range prior.Metrics {
|
|
names[k] = struct{}{}
|
|
}
|
|
}
|
|
rows := make([]AuditDriftRow, 0, len(names))
|
|
for name := range names {
|
|
row := AuditDriftRow{Metric: name, Current: current[name]}
|
|
if prior == nil {
|
|
row.Flag = AuditDriftFlagFirstRun
|
|
rows = append(rows, row)
|
|
continue
|
|
}
|
|
priorVal, hadPrior := prior.Metrics[name]
|
|
if !hadPrior {
|
|
// New metric in current — treat as first-run for THIS metric.
|
|
row.Flag = AuditDriftFlagFirstRun
|
|
rows = append(rows, row)
|
|
continue
|
|
}
|
|
row.Baseline = &priorVal
|
|
if priorVal == 0 {
|
|
// Division-by-zero: leave PctChange nil. If current is
|
|
// also 0 → ok (no change). Otherwise → warn (the metric
|
|
// went from zero to non-zero, which is always notable).
|
|
if current[name] == 0 {
|
|
row.Flag = AuditDriftFlagOK
|
|
} else {
|
|
row.Flag = AuditDriftFlagWarn
|
|
}
|
|
rows = append(rows, row)
|
|
continue
|
|
}
|
|
pct := float64(current[name]-priorVal) / float64(priorVal)
|
|
row.PctChange = &pct
|
|
if math.Abs(pct) > threshold {
|
|
row.Flag = AuditDriftFlagWarn
|
|
} else {
|
|
row.Flag = AuditDriftFlagOK
|
|
}
|
|
rows = append(rows, row)
|
|
}
|
|
// Sort for stable display + deterministic JSON output. Bubble-
|
|
// sort by name; size is at most a few dozen metrics, so the
|
|
// O(n²) cost is irrelevant.
|
|
for i := 0; i < len(rows); i++ {
|
|
for j := i + 1; j < len(rows); j++ {
|
|
if rows[i].Metric > rows[j].Metric {
|
|
rows[i], rows[j] = rows[j], rows[i]
|
|
}
|
|
}
|
|
}
|
|
return rows
|
|
}
|
|
|
|
// FormatAuditDriftTable renders a drift table as a fixed-width
|
|
// text grid — useful for stdout dumps in audit-full runs. Matches
|
|
// the Rust output shape so an operator can grep across runtimes
|
|
// without re-learning the layout.
|
|
func FormatAuditDriftTable(rows []AuditDriftRow) string {
|
|
if len(rows) == 0 {
|
|
return "(no metrics)\n"
|
|
}
|
|
var buf bytes.Buffer
|
|
fmt.Fprintf(&buf, "%-26s %12s %12s %10s %s\n", "metric", "baseline", "current", "Δ%", "flag")
|
|
for _, r := range rows {
|
|
baseline := "-"
|
|
if r.Baseline != nil {
|
|
baseline = fmt.Sprintf("%d", *r.Baseline)
|
|
}
|
|
pct := "-"
|
|
if r.PctChange != nil {
|
|
pct = fmt.Sprintf("%+.1f%%", *r.PctChange*100)
|
|
}
|
|
fmt.Fprintf(&buf, "%-26s %12s %12d %10s %s\n",
|
|
r.Metric, baseline, r.Current, pct, r.Flag)
|
|
}
|
|
return buf.String()
|
|
}
|