lakehouse/auditor/checks/static.ts

// Static diff check — grep-style, no AST, no LLM. Looks for patterns
// that are high-signal evidence of placeholder code.
//
// Findings are severity-graded:
//   block — explicit non-impl markers (unimplemented!, todo!,
//           panic!("not implemented"), throw new Error("not implemented"))
//   warn  — TODO / FIXME / XXX / HACK comments on added lines,
//           new struct fields with no read-site anywhere in the diff,
//           suspiciously-empty function bodies ({ Ok(()) } / {} when
//           the commit message claims the fn "implements" something)
//   info  — hardcoded "placeholder" / "dummy" / "foobar" / "xxx" /
//           "replaceme" / "changeme" strings in added lines (could be
//           real, just flag for inspection)
//
// Consumes: raw unified diff text from Gitea.

import type { Finding } from "../types.ts";

// Severity "block": Rust and TypeScript constructs that almost always
// mean "this code path is not implemented yet". Each entry pairs the
// detecting regex (`re`) with the reason text (`why`) that ends up in
// the finding summary.
const BLOCK_PATTERNS: { re: RegExp; why: string }[] = [
  {
    // Rust `unimplemented!(...)`
    why: "unimplemented!() macro call",
    re: /\bunimplemented!\s*\(/,
  },
  {
    // Rust `todo!(...)`
    why: "todo!() macro call",
    re: /\btodo!\s*\(/,
  },
  {
    // Rust `panic!("not implemented ...")` and close variants, any case
    why: "panic! with not-implemented message",
    re: /panic!\s*\(\s*"(?:not implemented|TODO|not yet|unimpl)/i,
  },
  {
    // JS/TS `throw new Error("not implemented ...")`, either quote style
    why: "throw Error 'not implemented'",
    re: /throw\s+new\s+Error\s*\(\s*['"](?:not implemented|TODO|unimpl)/i,
  },
];

// Severity "warn": work-marker comments on added diff lines. These
// regexes run against the RAW diff line, so the leading `+` anchor is
// part of the pattern. Two comment leaders are covered: `//` and `#`.
const WARN_COMMENT_PATTERNS: { re: RegExp; why: string }[] = [
  {
    why: "TODO/FIXME/XXX/HACK comment added",
    re: /^\+.*\/\/\s*(TODO|FIXME|XXX|HACK)\b/i,
  },
  {
    why: "TODO/FIXME/XXX/HACK comment added",
    re: /^\+.*#\s*(TODO|FIXME|XXX|HACK)\b/i,
  },
];

// Severity "info": double-quoted string literals whose ENTIRE content is
// a well-known filler word (case-insensitive). Informational only — the
// value may be legitimate, so it is surfaced for a human to inspect.
const INFO_HARDCODED_PATTERNS: { re: RegExp; why: string }[] = [
  {
    why: "suspicious hardcoded string",
    re: /"(?:placeholder|dummy|foobar|xxx|replaceme|changeme)"/i,
  },
];

/**
 * Run the grep-style placeholder scan over a unified diff.
 *
 * Findings are only raised for added lines (`+` prefix, excluding the
 * `+++` file header). For every added line three pattern passes run —
 * BLOCK_PATTERNS, WARN_COMMENT_PATTERNS, INFO_HARDCODED_PATTERNS — and
 * afterwards each file gets the "field added but never read" heuristic.
 *
 * The `+N` in evidence strings is the 1-based position of the line
 * inside that file's section of the diff (the lines following its
 * `diff --git` header), not a line number in the source file.
 *
 * @param diff Raw unified diff text; files are delimited by `diff --git` headers.
 * @returns Findings in discovery order; empty when nothing matched.
 */
export function runStaticCheck(diff: string): Finding[] {
  const findings: Finding[] = [];

  // Added line = prefix '+' but not '+++' (which is the diff header).
  const isAddedLine = (l: string): boolean => l.startsWith("+") && !l.startsWith("+++");

  const perFile = splitDiffByFile(diff);

  // Added lines across the WHOLE diff, with the '+' stripped. The
  // field-read heuristic below promises "no read-site in the diff", so a
  // field declared in one file and consumed in another file of the same
  // PR must count as read. Built once up front, not per file.
  const allAddedLines: string[] = [];
  for (const fileLines of perFile.values()) {
    for (const l of fileLines) {
      if (isAddedLine(l)) allAddedLines.push(l.slice(1));
    }
  }

  for (const [path, lines] of perFile) {
    // Skip diff bookkeeping + pure-delete files
    if (!lines.some(l => isAddedLine(l))) continue;

    // The auditor's own check files literally contain the BLOCK
    // patterns as regex definitions (BLOCK_PATTERNS in this file,
    // prompt examples in inference.ts). Skipping BLOCK scan on these
    // specific paths prevents the checker from self-flagging its own
    // string literals. WARN/INFO patterns still run — those genuinely
    // could indicate problems in the checker's own code (TODO
    // comments don't self-define).
    const isAuditorCheckerFile = path.startsWith("auditor/checks/") ||
                                 path.startsWith("auditor/fixtures/");

    for (let idx = 0; idx < lines.length; idx++) {
      const line = lines[idx];
      if (!isAddedLine(line)) continue;
      const added = line.slice(1);
      const evidence = [`${path}:+${idx + 1}: ${added.trim().slice(0, 160)}`];

      if (!isAuditorCheckerFile) {
        for (const { re, why } of BLOCK_PATTERNS) {
          const m = added.match(re);
          if (m && typeof m.index === "number") {
            // Skip if the match sits inside a quoted string literal —
            // this is how rubric files (tests/real-world/*, prompt
            // templates) legitimately reference the patterns they
            // guard against, without actually executing them.
            if (isInsideQuotedString(added, m.index)) continue;
            findings.push({
              check: "static",
              severity: "block",
              summary: `${why} in ${path}`,
              evidence: [...evidence],
            });
          }
        }
      }
      // WARN patterns are anchored on the leading '+', so they test the
      // raw diff line rather than the stripped text.
      for (const { re, why } of WARN_COMMENT_PATTERNS) {
        if (re.test(line)) {
          findings.push({
            check: "static",
            severity: "warn",
            summary: `${why} in ${path}`,
            evidence: [...evidence],
          });
        }
      }
      for (const { re, why } of INFO_HARDCODED_PATTERNS) {
        if (re.test(added)) {
          findings.push({
            check: "static",
            severity: "info",
            summary: `${why} in ${path}`,
            evidence: [...evidence],
          });
        }
      }
    }

    // "Field added but never read" heuristic — catches exactly the
    // Phase 45 DocRef placeholder pattern. Limited to the diff itself:
    // we're not doing a full-codebase grep here (too noisy; callers
    // elsewhere might exist). The point is: if no added line anywhere
    // in the diff mentions the field besides its own declaration, the
    // PR is shipping state without a consumer.
    //
    // Serde exemption: if the field's parent struct derives Serialize
    // or Deserialize, the read-site is the macro itself — JSON
    // round-trips consume every public field. Without this exemption
    // the check produces false positives on every response/request
    // struct shipped through `/v1/*`.
    const newFields = extractNewFieldsWithLine(lines);
    const seenNames = new Set<string>();
    for (const { name: field, lineIdx } of newFields) {
      if (seenNames.has(field)) continue;
      seenNames.add(field);
      if (parentStructHasSerdeDerive(lines, lineIdx)) continue;
      const fieldRe = escape(field);
      const mentionPattern = new RegExp(`[\\.:]\\s*${fieldRe}\\b|\\b${fieldRe}\\s*:`);
      // A `pub <field>:` declaration also matches mentionPattern. Exclude
      // declarations explicitly instead of assuming exactly one of them:
      // the same field name declared in two structs (or two files) must
      // not make the declarations count as each other's reader.
      const declarationPattern = new RegExp(`^\\s*pub\\s+${fieldRe}\\s*:`);
      const mentions = allAddedLines.filter(
        l => mentionPattern.test(l) && !declarationPattern.test(l),
      ).length;
      if (mentions === 0) {
        findings.push({
          check: "static",
          severity: "warn",
          summary: `field '${field}' added in ${path} but no read-site in the diff — could be placeholder state without a consumer`,
          evidence: [`${path}: added '${field}' with no reader; rest of diff has ${mentions} mentions`],
        });
      }
    }
  }

  return findings;
}

/**
 * Partition a unified diff into per-file line lists.
 *
 * A new section starts at every `diff --git a/<old> b/<new>` header; the
 * section is keyed by the `b/` (post-change) path and holds every line
 * up to the next header, header line itself excluded. Lines that appear
 * before the first header are dropped. Accepts LF and CRLF line endings.
 */
function splitDiffByFile(diff: string): Map<string, string[]> {
  const headerRe = /^diff --git a\/(\S+) b\/(\S+)/;
  const sections = new Map<string, string[]>();
  let activePath: string | null = null;
  let pending: string[] = [];

  for (const row of diff.split(/\r?\n/)) {
    const header = headerRe.exec(row);
    if (header === null) {
      // Ordinary line: belongs to whichever section is currently open.
      pending.push(row);
      continue;
    }
    // New file header: close the previous section (if any), open the next.
    if (activePath) sections.set(activePath, pending);
    activePath = header[2];
    pending = [];
  }

  // Close the final section — no trailing header triggers it.
  if (activePath) sections.set(activePath, pending);
  return sections;
}

// Collect every ADDED diff line that declares a Rust-style
// `pub name: Type` field, remembering the line's index within `lines`
// so the caller can locate the parent struct afterwards. Deliberately
// narrow: the trimmed line must begin with `pub` + whitespace, and
// item keywords (`fn`, `struct`, `enum`, `mod`, `use`, `trait`, `impl`,
// `const`, `static`, `type`) are rejected by the negative lookahead.
function extractNewFieldsWithLine(lines: string[]): Array<{ name: string; lineIdx: number }> {
  const fieldDecl = /^pub\s+(?!fn\b|struct\b|enum\b|mod\b|use\b|trait\b|impl\b|const\b|static\b|type\b)(\w+)\s*:/;
  const found: Array<{ name: string; lineIdx: number }> = [];

  lines.forEach((raw, lineIdx) => {
    // Only real additions: '+' prefix, but not the '+++' file header.
    const isAdded = raw.startsWith("+") && !raw.startsWith("+++");
    if (!isAdded) return;

    const decl = fieldDecl.exec(raw.slice(1).trim());
    if (decl !== null) found.push({ name: decl[1], lineIdx });
  });

  return found;
}

// True if the field at `fieldLineIdx` lives inside a struct whose
// declaration carries `#[derive(... Serialize|Deserialize ...)]`.
//
// Walks backward through the per-file diff lines to the nearest struct
// declaration — any visibility: `struct`, `pub struct`,
// `pub(crate) struct` — then scans the lines directly above it for
// derive attributes. Added and context lines both count (a struct
// declaration unchanged by the PR still appears as context). Removed
// (`-`) lines are skipped: they are not part of the post-change file,
// so a deleted derive or a deleted struct header must not influence
// the verdict.
//
// Conservative bounds (measured in diff lines, removed ones included):
//   - 80 lines back to find the struct (struct definitions can grow large)
//   - 8 lines above the struct keyword for attribute lines
// The struct search gives up when it meets a `}` at column 0 (the
// previous top-level scope ended above the field, so the field is not
// inside a struct we can see).
//
// Limitation: a derive list split over several lines is not recognised —
// the attribute scan stops at the first line that is neither blank, a
// `//` comment, nor starts with `#[`.
function parentStructHasSerdeDerive(lines: string[], fieldLineIdx: number): boolean {
  let structLineIdx = -1;
  for (let i = fieldLineIdx - 1; i >= 0 && i >= fieldLineIdx - 80; i--) {
    const raw = lines[i];
    if (typeof raw !== "string" || raw.length === 0) continue;
    // Removed lines (and the `---` file header) do not exist in the new file.
    if (raw.startsWith("-")) continue;
    const body = stripDiffPrefix(raw);
    const trimmed = body.trim();
    // Optional visibility (`pub`, `pub(crate)`, `pub(in path)`) + `struct Name`.
    if (/^(?:pub(?:\s*\([^)]*\))?\s+)?struct\s+\w/.test(trimmed)) {
      structLineIdx = i;
      break;
    }
    // Closing brace at column 0 means the enclosing scope ended above
    // the field — we're not actually inside a struct.
    if (body.startsWith("}")) return false;
  }
  if (structLineIdx < 0) return false;

  for (let j = structLineIdx - 1; j >= 0 && j >= structLineIdx - 8; j--) {
    const raw = lines[j];
    if (typeof raw !== "string") continue;
    // A derive the PR deletes no longer applies to the struct.
    if (raw.startsWith("-")) continue;
    const trimmed = stripDiffPrefix(raw).trim();
    // Blank lines and (doc) comments may sit between attributes and the struct.
    if (trimmed === "" || trimmed.startsWith("//") || trimmed.startsWith("///")) continue;
    // Anything else that is not an attribute ends the attribute run.
    if (!trimmed.startsWith("#[")) break;
    if (/derive\s*\([^)]*\b(Serialize|Deserialize)\b/.test(trimmed)) return true;
  }
  return false;
}

// Remove the one-character unified-diff marker (`+`, `-`, or a space)
// from the front of a line and return the underlying source text. Lines
// without such a marker — including the empty string, which some diffs
// emit for blank context lines — are returned unchanged.
function stripDiffPrefix(line: string): string {
  // charAt(0) yields "" for an empty line, which falls through to default.
  switch (line.charAt(0)) {
    case "+":
    case "-":
    case " ":
      return line.slice(1);
    default:
      return line;
  }
}

// True if `pos` falls inside a double-quoted, single-quoted, or backtick
// string on this line. Scans left→right up to (not including) `pos`,
// opening a string on the first quote character seen and closing it on
// the next unescaped quote of the SAME kind; other quote characters
// inside an open string are ignored.
//
// A backslash consumes the character after it, so `\"` does not close a
// string while the quote in `\\"` does (the backslash there is itself
// escaped). Checking only "is the previous character a backslash" gets
// the second case wrong and leaves the string open for the rest of the
// line, hiding real code that follows it.
//
// Good enough for single-line matches; multi-line strings aren't parsed
// (they're extremely rare in the patterns we're blocking on, and would
// require a proper tokenizer to handle correctly). For the same reason a
// lone apostrophe (Rust lifetime, prose in a comment) is treated as an
// opening quote.
function isInsideQuotedString(line: string, pos: number): boolean {
  // Quote character of the string currently open; "" when outside any string.
  let openQuote = "";
  const end = Math.min(pos, line.length);
  for (let i = 0; i < end; i++) {
    const c = line[i];
    if (c === "\\") {
      // Escape sequence: skip the escaped character entirely.
      i++;
      continue;
    }
    if (c !== '"' && c !== "'" && c !== "`") continue;
    if (openQuote === "") openQuote = c;
    else if (openQuote === c) openQuote = "";
  }
  return openQuote !== "";
}

/**
 * Backslash-escape every regex metacharacter in `s` so the result can be
 * embedded in a `new RegExp(...)` source and match `s` literally.
 */
function escape(s: string): string {
  const metaChars = /[.*+?^${}()|[\]\\]/g;
  return s.replace(metaChars, (ch) => "\\" + ch);
}