profit ac01fffd9a checkpoint: matrix-agent-validated (2026-04-25)
Architectural snapshot of the lakehouse codebase at the point where the
full matrix-driven agent loop with Mem0 versioning + deletion was
validated end-to-end.

WHAT THIS REPO IS
A clean single-commit snapshot of the lakehouse code. Heavy test data
(.parquet datasets, vector indexes) excluded — see REPLICATION.md for
regen path. Full lakehouse history at git.agentview.dev/profit/lakehouse.

WHAT WAS PROVEN
- Vector retrieval across multi-corpora matrix (chicago_permits + entity
  briefs + sec_tickers + distilled procedural + llm_team runs)
- Observer hand-review (cloud + heuristic fallback) gating each candidate
- Local-model agent loop (qwen3.5:latest) with tool use + scratchpad
- Playbook seal on success → next-iter retrieval surfaces it as preamble
- Mem0 versioning + deletion in pathway_memory:
    * UPSERT: ADD on new workflow, UPDATE bumps replay_count on identical
    * REVISE: chains versions, parent.superseded_at + superseded_by stamped
    * RETIRE: marks specific trace retired with reason, excluded from retrieval
    * HISTORY: walks chain root→tip, cycle-safe

KEY DIRECTORIES
- crates/vectord/src/pathway_memory.rs — Mem0 ops live here
- crates/vectord/src/playbook_memory.rs — original Mem0 reference
- tests/agent_test/ — local-model agent harness + PRD + session archives
- scripts/dump_raw_corpus.sh — MinIO bucket dump (raw test corpus)
- scripts/vectorize_raw_corpus.ts — corpus → vector indexes
- scripts/analyze_chicago_contracts.ts — real inference pipeline
- scripts/seal_agent_playbook.ts — Mem0 upsert from agent traces

Replication: see REPLICATION.md for Debian 13 clean install + cloud-only
adaptation (no local Ollama).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-25 19:43:27 -05:00

185 lines
7.6 KiB
TypeScript

// Static diff check — grep-style, no AST, no LLM. Looks for patterns
// that are high-signal evidence of placeholder code.
//
// Findings are severity-graded:
//   block — explicit non-impl markers (unimplemented!, todo!,
//           panic!("not implemented"), throw new Error("not implemented"))
//   warn  — TODO / FIXME / XXX / HACK comments on added lines,
//           new struct fields with no read-site anywhere in the diff,
//           suspiciously-empty function bodies ({ Ok(()) } / {} when
//           the commit message claims the fn "implements" something).
//           NOTE: the empty-body check is described here but no code
//           in this file performs it.
//   info  — hardcoded "placeholder" / "dummy" / "foobar" / "xxx" /
//           "replaceme" / "changeme" strings in added lines (could be
//           real, just flag for inspection)
//
// Consumes: raw unified diff text from Gitea.
import type { Finding } from "../types.ts";
// Rust + TypeScript patterns that almost always indicate "this is
// not actually implemented yet." Each entry pairs a regex (applied to
// the text of an added diff line) with the human-readable reason that
// is reported when it matches.
const BLOCK_PATTERNS: Array<{ re: RegExp; why: string }> = [
  // Rust placeholder macros.
  { re: /\bunimplemented!\s*\(/, why: "unimplemented!() macro call" },
  { re: /\btodo!\s*\(/, why: "todo!() macro call" },
  // panic! whose message starts with a not-implemented phrase.
  { re: /panic!\s*\(\s*"(?:not implemented|TODO|not yet|unimpl)/i, why: "panic! with not-implemented message" },
  // JS/TS throw whose message starts with a not-implemented phrase. The
  // message may be delimited by ', " or ` — template literals are a
  // common way to write these (e.g. `not implemented: ${name}`).
  { re: /throw\s+new\s+Error\s*\(\s*['"`](?:not implemented|TODO|unimpl)/i, why: "throw Error 'not implemented'" },
];
// Work-in-progress markers (TODO / FIXME / XXX / HACK, case-insensitive)
// that follow a `//` or a `#` somewhere on the line. Both regexes are
// anchored on the leading '+' of a unified-diff added line, so they must
// be tested against the raw diff line, not the '+'-stripped text. The
// two entries share the same `why` text.
const WARN_COMMENT_PATTERNS: Array<{ re: RegExp; why: string }> = [
{ re: /^\+.*\/\/\s*(TODO|FIXME|XXX|HACK)\b/i, why: "TODO/FIXME/XXX/HACK comment added" },
{ re: /^\+.*#\s*(TODO|FIXME|XXX|HACK)\b/i, why: "TODO/FIXME/XXX/HACK comment added" },
];
// Double-quoted string literals whose entire content is one of a few
// well-known stand-in values (placeholder, dummy, foobar, xxx, replaceme,
// changeme), matched case-insensitively. Single-quoted and backtick
// strings are not covered by this pattern.
const INFO_HARDCODED_PATTERNS: Array<{ re: RegExp; why: string }> = [
{ re: /"(?:placeholder|dummy|foobar|xxx|replaceme|changeme)"/i, why: "suspicious hardcoded string" },
];
/**
 * Grep-style scan of a unified diff for evidence of placeholder code.
 *
 * Only added lines (prefix '+', excluding the '+++' file header) are
 * inspected. For each added line the three pattern tables are applied:
 *
 *   - block: any BLOCK_PATTERNS match that is not inside a quoted string
 *            (skipped entirely for files under auditor/checks/ and
 *            auditor/fixtures/)
 *   - warn:  any WARN_COMMENT_PATTERNS match
 *   - info:  any INFO_HARDCODED_PATTERNS match
 *
 * In addition, each file's newly added `pub name:` fields are checked
 * for at least one other mention among the added lines of the WHOLE
 * diff; fields with none produce a warn finding.
 *
 * The `+N` in line-level evidence is the 1-based position of the line
 * within that file's section of the diff (header lines included), not a
 * line number in the source file.
 *
 * @param diff Raw unified diff text (`diff --git` format).
 * @returns Findings in the order they were encountered; empty when
 *          nothing matched.
 */
export function runStaticCheck(diff: string): Finding[] {
  const findings: Finding[] = [];
  const perFile = splitDiffByFile(diff);

  // Added-line text ('+' stripped) from every file in the diff. The
  // field heuristic below searches this, so a field defined in one file
  // and consumed in another file of the same PR is not reported as
  // having no reader.
  const allAddedLines: string[] = [];
  for (const fileLines of perFile.values()) {
    for (const l of fileLines) {
      if (isAddedDiffLine(l)) allAddedLines.push(l.slice(1));
    }
  }

  for (const [path, lines] of perFile) {
    // Skip diff bookkeeping + pure-delete files.
    if (!lines.some(l => isAddedDiffLine(l))) continue;

    // The auditor's own check files literally contain the BLOCK
    // patterns as regex definitions and prompt examples. Skipping the
    // BLOCK scan on these paths prevents the checker from flagging its
    // own string literals. WARN/INFO patterns still run — those could
    // genuinely indicate problems in the checker's own code.
    const isAuditorCheckerFile = path.startsWith("auditor/checks/") ||
      path.startsWith("auditor/fixtures/");

    for (let idx = 0; idx < lines.length; idx++) {
      const line = lines[idx];
      if (!isAddedDiffLine(line)) continue;
      const added = line.slice(1);
      const evidence = `${path}:+${idx + 1}: ${added.trim().slice(0, 160)}`;

      if (!isAuditorCheckerFile) {
        for (const { re, why } of BLOCK_PATTERNS) {
          // A match inside a quoted string literal is how rubric files
          // and prompt templates legitimately reference the patterns
          // they guard against. Every occurrence on the line is
          // examined, so a real call that follows a quoted mention of
          // the same pattern is still reported.
          if (!hasMatchOutsideQuotes(added, re)) continue;
          findings.push({
            check: "static",
            severity: "block",
            summary: `${why} in ${path}`,
            evidence: [evidence],
          });
        }
      }

      // These regexes are anchored on the '+' prefix, so they are
      // tested against the raw diff line.
      for (const { re, why } of WARN_COMMENT_PATTERNS) {
        if (re.test(line)) {
          findings.push({
            check: "static",
            severity: "warn",
            summary: `${why} in ${path}`,
            evidence: [evidence],
          });
        }
      }

      for (const { re, why } of INFO_HARDCODED_PATTERNS) {
        if (re.test(added)) {
          findings.push({
            check: "static",
            severity: "info",
            summary: `${why} in ${path}`,
            evidence: [evidence],
          });
        }
      }
    }

    // "Field added but never read" heuristic. Limited to the diff
    // itself: no full-codebase grep (too noisy; callers elsewhere might
    // exist). If no added line anywhere in the diff mentions the field
    // apart from its definition, the PR is shipping state without a
    // consumer.
    const addedLines = lines.filter(l => isAddedDiffLine(l)).map(l => l.slice(1));
    for (const field of extractNewFields(addedLines)) {
      const name = escape(field);
      const mentionPattern = new RegExp(`[\\.:]\\s*${name}\\b|\\b${name}\\s*:`);
      // `pub <field>:` lines are definitions. They also satisfy
      // mentionPattern, so they are excluded explicitly; otherwise two
      // structs declaring the same field name would count as each
      // other's reader.
      const definitionPattern = new RegExp(`^pub\\s+${name}\\s*:`);
      const mentions = allAddedLines.filter(
        l => mentionPattern.test(l) && !definitionPattern.test(l.trim()),
      );
      if (mentions.length === 0) {
        findings.push({
          check: "static",
          severity: "warn",
          summary: `field '${field}' added in ${path} but no read-site in the diff — could be placeholder state without a consumer`,
          evidence: [`${path}: added '${field}' with no reader; rest of diff has ${mentions.length} mentions`],
        });
      }
    }
  }
  return findings;
}

// True for a unified-diff added line: starts with '+' but is not the
// '+++' new-file header.
function isAddedDiffLine(line: string): boolean {
  return line.startsWith("+") && !line.startsWith("+++");
}

// True if `re` matches `text` at least once at a position that is not
// inside a quoted string (per isInsideQuotedString). Scans every
// occurrence, not just the first.
function hasMatchOutsideQuotes(text: string, re: RegExp): boolean {
  // Work on a global copy so exec() advances through the line and the
  // shared pattern object's lastIndex is never touched.
  const scanner = new RegExp(re.source, re.flags.includes("g") ? re.flags : `${re.flags}g`);
  let m: RegExpExecArray | null;
  while ((m = scanner.exec(text)) !== null) {
    if (!isInsideQuotedString(text, m.index)) return true;
    // Defensive: a zero-length match would otherwise never advance.
    if (m[0].length === 0) scanner.lastIndex++;
  }
  return false;
}
/**
 * Split a unified diff into per-file sections.
 *
 * Every line that starts with `diff --git ` opens a new section; the
 * lines that follow (index/---/+++ headers, hunks) are collected under
 * that file's path until the next header. Lines before the first header
 * are discarded. Input may use LF or CRLF line endings.
 *
 * @param diff Raw unified diff text.
 * @returns Map from file path (the `b/` side of the header) to the lines
 *          of that file's section, in diff order.
 */
function splitDiffByFile(diff: string): Map<string, string[]> {
  const HEADER_PREFIX = "diff --git ";
  const out = new Map<string, string[]>();
  let current: string | null = null;
  let buf: string[] = [];
  for (const line of diff.split(/\r?\n/)) {
    // Treat every header line as a file boundary even when its path is
    // awkward to parse. Failing to recognise a header would silently
    // attribute this file's lines to the previous file.
    if (line.startsWith(HEADER_PREFIX)) {
      if (current !== null) out.set(current, buf);
      current = pathFromDiffHeader(line.slice(HEADER_PREFIX.length));
      buf = [];
      continue;
    }
    buf.push(line);
  }
  if (current !== null) out.set(current, buf);
  return out;
}

// Extract the destination path from the part of a `diff --git` header
// that follows the prefix. Tries progressively looser forms:
//   1. `a/X b/Y` with no whitespace in either path  -> Y
//   2. `a/X b/X` where X may contain spaces          -> X
//   3. `"a/X" "b/Y"` (git's quoted form; escape sequences inside the
//      quotes are returned as written, not decoded) -> Y
//   4. `a/X b/Y` split at the first " b/"            -> Y
// Falls back to the raw header remainder so the section still gets a
// distinct key.
function pathFromDiffHeader(rest: string): string {
  const plain = rest.match(/^a\/(\S+) b\/(\S+)/);
  if (plain) return plain[2];
  const samePath = rest.match(/^a\/(.+) b\/\1$/);
  if (samePath) return samePath[1];
  const quoted = rest.match(/^"a\/(.+)" "b\/(.+)"$/);
  if (quoted) return quoted[2];
  const loose = rest.match(/^a\/(.+?) b\/(.+)$/);
  if (loose) return loose[2];
  return rest;
}
// Collect the names of Rust struct fields declared as `pub name: Type`
// on added lines. Deliberately narrow: the declaration must begin the
// trimmed line with `pub` followed by whitespace, and item keywords
// (fn, struct, enum, mod, use, trait, impl, const, static, type) are
// rejected as the name. Each name is returned once, in first-seen order.
function extractNewFields(addedLines: string[]): string[] {
  const fieldDecl = /^pub\s+(?!fn\b|struct\b|enum\b|mod\b|use\b|trait\b|impl\b|const\b|static\b|type\b)(\w+)\s*:/;
  const seen = new Set<string>();
  addedLines.forEach((raw) => {
    const hit = fieldDecl.exec(raw.trim());
    if (hit !== null) {
      seen.add(hit[1]);
    }
  });
  return [...seen];
}
// True if `pos` falls inside a double-quoted, single-quoted or backtick
// string on this line. Walks left→right, toggling the "in quote" state
// on each unescaped quote character; a quote of one kind is ignored
// while a string of another kind is open.
//
// A backslash escapes exactly the one character after it, so an escaped
// backslash directly before a closing quote ("a\\") does not hide that
// quote.
//
// Heuristic only: multi-line strings aren't parsed (that would need a
// proper tokenizer), and an unpaired apostrophe earlier on the line
// (for example a Rust lifetime such as 'a) is treated as opening a
// string.
function isInsideQuotedString(line: string, pos: number): boolean {
  let inDouble = false, inSingle = false, inBacktick = false;
  const end = Math.min(pos, line.length);
  for (let i = 0; i < end; i++) {
    const c = line[i];
    if (c === "\\") {
      // Consume the escaped character together with the backslash so a
      // run of backslashes pairs up correctly.
      i++;
      continue;
    }
    if (c === '"' && !inSingle && !inBacktick) inDouble = !inDouble;
    else if (c === "'" && !inDouble && !inBacktick) inSingle = !inSingle;
    else if (c === "`" && !inDouble && !inSingle) inBacktick = !inBacktick;
  }
  return inDouble || inSingle || inBacktick;
}
// Backslash-escape every regex metacharacter in `s` so the result can be
// embedded in a RegExp source and match the text literally.
function escape(s: string): string {
  const regexMetaChars = /[.*+?^${}()|[\]\\]/g;
  // "$&" in the replacement stands for the matched character itself.
  const backslashPrefixed = "\\$&";
  return s.replace(regexMetaChars, backslashPrefixed);
}