// Input normalizer — accepts whatever shape a caller provides (clean
// structured spec, partial JSON, natural language, a mix) and produces
// a canonical shape the rest of the pipeline understands. This is the
// missing piece that lets KB + playbook_memory + overseer respond
// "seamlessly with whatever input they're given" as J framed it.
//
// The pipeline already works when input is `{role, city, state, count}`;
// what breaks today is "I need three welders in Nashville Monday" —
// the SQL-filter extractors return None, the boost pipeline sees no
// role/geo signal, and retrieval silently degrades to raw cosine.
//
// Strategy: fast path for structured input, LLM fallback only for
// natural language. We don't want to burn an LLM call on "role=Welder
// city=Nashville state=TN count=3" — that's wasteful.

import { generateContinuable } from "./agent.ts";

/**
 * Canonical request shape produced by the normalizer, whichever
 * extraction path (structured, regex, LLM) filled it in.
 */
export interface NormalizedInput {
  /** Staffing role, or null when none was determined. */
  role: string | null;
  /** City name, or null when none was determined. */
  city: string | null;
  /** Two-letter state abbreviation, or null. */
  state: string | null;
  /** Number of workers requested, or null. */
  count: number | null;
  /** Client / company name, or null. */
  client: string | null;
  /** ISO date, or null. */
  deadline: string | null;
  /** What the caller is asking the pipeline to do. */
  intent: "fill" | "lookup" | "rescue" | "rebuild" | "unknown";
  /** What the caller originally sent. */
  raw_text: string;
  /** How complete the extraction is. */
  confidence: "high" | "medium" | "low";
  /** Which extraction path produced this result. */
  extraction_method: "structured" | "regex" | "llm" | "mixed";
  /** Fields the normalizer couldn't determine. */
  missing_fields: string[];
}

// US state abbreviations for regex city,state extraction.
const STATE_ABBRS = new Set<string>([
  "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "FL", "GA", "HI", "ID", "IL", "IN", "IA",
  "KS", "KY", "LA", "ME", "MD", "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ",
  "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC", "SD", "TN", "TX", "UT", "VT",
  "VA", "WA", "WV", "WI", "WY", "DC",
]);

/** Lower-cased full state name → two-letter abbreviation. */
const STATE_NAMES_TO_ABBR: Record<string, string> = {
  alabama: "AL", alaska: "AK", arizona: "AZ", arkansas: "AR",
  california: "CA", colorado: "CO", connecticut: "CT", delaware: "DE",
  "district of columbia": "DC",
  florida: "FL", georgia: "GA", hawaii: "HI", idaho: "ID",
  illinois: "IL", indiana: "IN", iowa: "IA", kansas: "KS",
  kentucky: "KY", louisiana: "LA", maine: "ME", maryland: "MD",
  massachusetts: "MA", michigan: "MI", minnesota: "MN", mississippi: "MS",
  missouri: "MO", montana: "MT", nebraska: "NE", nevada: "NV",
  "new hampshire": "NH", "new jersey": "NJ", "new mexico": "NM", "new york": "NY",
  "north carolina": "NC", "north dakota": "ND", ohio: "OH", oklahoma: "OK",
  oregon: "OR", pennsylvania: "PA", "rhode island": "RI", "south carolina": "SC",
  "south dakota": "SD", tennessee: "TN", texas: "TX", utah: "UT",
  vermont: "VT", virginia: "VA", washington: "WA", "west virginia": "WV",
  wisconsin: "WI", wyoming: "WY",
};

// Canonical staffing roles — the normalizer recognizes these regardless
// of casing / plurality. Extend as domains grow.
const KNOWN_ROLES = [
  "Warehouse Associate", "Machine Operator", "Forklift Operator", "Loader",
  "Material Handler", "Assembler", "Quality Tech", "Picker", "Packer",
  "Shipping Clerk", "Receiving Clerk", "Welder", "CNC Operator",
  "Maintenance Tech", "Electrician", "Tool & Die Maker", "Safety Coordinator",
  "Logistics Coordinator", "Packaging Operator", "Sanitation Worker", "Line Lead",
];

// Roles ordered longest-first, paired with their lower-cased needle, so the
// most specific role wins. Built once from a COPY: sorting KNOWN_ROLES itself
// would silently reorder the canonical list for every other reader.
const ROLE_NEEDLES: ReadonlyArray<{ role: string; needle: string }> = [...KNOWN_ROLES]
  .sort((a, b) => b.length - a.length)
  .map((role) => ({ role, needle: role.toLowerCase() }));

// Intent keyword hints. Check in order — "rescue" beats generic "fill"
// for "rescue the Nashville welder fill" kind of input.
// Each alternation is grouped so the leading \b applies to EVERY keyword;
// ungrouped, only the first keyword was anchored and e.g. "New Hampshire"
// matched "hire". Keywords stay prefix matches ("staff" → "staffing").
const INTENT_HINTS: Array<{ re: RegExp; intent: NormalizedInput["intent"] }> = [
  { re: /\b(?:rescue|pivot|retry|remediat)/i, intent: "rescue" },
  { re: /\b(?:rebuild|reindex|re-?embed)/i, intent: "rebuild" },
  { re: /\b(?:lookup|find|show me|who is|what is)/i, intent: "lookup" },
  { re: /\b(?:(?:back)?fill|hire|staff|need|schedule|book)/i, intent: "fill" },
];

const VALID_INTENTS: ReadonlyArray<NormalizedInput["intent"]> = [
  "fill", "lookup", "rescue", "rebuild", "unknown",
];

// Count: "need 5" / "fill 8 safety coordinators" first, then bare "3 welders".
const COUNT_AFTER_VERB_RE = /\b(?:need|fill|book|hire|schedule|want)\s+(\d+)\b/i;
const COUNT_BEFORE_WORD_RE = /\b(\d+)\s+(?:x\s+)?[A-Za-z]/;

// City capture is 1-3 capitalized words — anything longer is usually the
// surrounding phrase ("Forklift Operators in Chicago").
const CITY_PATTERN = "([A-Z][a-zA-Z.'-]+(?:\\s+[A-Z][a-zA-Z.'-]+){0,2})";
// "in {City}, XX" or "in {City} XX"
const IN_CITY_ABBR_RE = new RegExp(`\\bin\\s+${CITY_PATTERN},?\\s+([A-Z]{2})\\b`);
// Bare "{City}, XX" (no "in" anchor)
const BARE_CITY_ABBR_RE = new RegExp(`${CITY_PATTERN},\\s+([A-Z]{2})\\b`);

/**
 * Regex source matching `name` in any letter case. The `i` flag can't be used
 * on the whole pattern because it would let CITY_PATTERN's [A-Z] match
 * lowercase letters, so each letter becomes its own two-case class.
 */
function caseInsensitiveSource(name: string): string {
  return name
    .split(" ")
    .map((word) => Array.from(word, (ch) => `[${ch}${ch.toUpperCase()}]`).join(""))
    .join("\\s+");
}

// Precompiled "{City}, FullStateName" matchers, longest state name first so
// "West Virginia" is tried before "Virginia" (otherwise "in Charleston West
// Virginia" resolves to city "Charleston West", state VA).
const STATE_NAME_MATCHERS: ReadonlyArray<{ abbr: string; anchored: RegExp; bare: RegExp }> =
  Object.entries(STATE_NAMES_TO_ABBR)
    .sort(([a], [b]) => b.length - a.length)
    .map(([name, abbr]) => {
      const nameSource = caseInsensitiveSource(name);
      return {
        abbr,
        anchored: new RegExp(`\\bin\\s+${CITY_PATTERN},?\\s+${nameSource}\\b`),
        bare: new RegExp(`${CITY_PATTERN},\\s+${nameSource}\\b`),
      };
    });

/** JSON.stringify that never throws (circular refs, BigInt) — falls back to String(). */
function safeStringify(value: unknown): string {
  try {
    return JSON.stringify(value) ?? String(value);
  } catch {
    return String(value);
  }
}

/** Turns arbitrary caller input into the text the regex / LLM paths operate on. */
function coerceToText(raw: unknown): string {
  if (typeof raw === "string") return raw;
  if (raw === null || raw === undefined) return "";
  return typeof raw === "object" ? safeStringify(raw) : String(raw);
}

/** Confidence bucket derived from how many core fields are still missing. */
function confidenceFor(missing: readonly string[]): NormalizedInput["confidence"] {
  if (missing.length === 0) return "high";
  return missing.length <= 2 ? "medium" : "low";
}

/**
 * Fast structured path — the input is already a FillEvent-shaped JSON or a
 * close cousin.
 *
 * @param raw Caller input of any type.
 * @returns The normalized input, or null to fall through to regex/LLM when
 *   `raw` is not an object or carries neither a role nor a city.
 */
function fromStructured(raw: unknown): NormalizedInput | null {
  if (!raw || typeof raw !== "object") return null;
  const fields = raw as Record<string, unknown>;

  const role = typeof fields.role === "string" ? fields.role : null;
  const city = typeof fields.city === "string" ? fields.city : null;
  const state = normalizeState(typeof fields.state === "string" ? fields.state : null);
  // NaN / Infinity are not usable counts; treat them as absent.
  const count =
    typeof fields.count === "number" && Number.isFinite(fields.count) ? fields.count : null;

  // Only treat as structured if AT LEAST role OR city is present; a
  // bare {kind: "baseline_fill"} shouldn't go through this path.
  if (!role && !city) return null;

  const kind = typeof fields.kind === "string" ? fields.kind : "";
  const isRescue = kind === "misplacement" || /rescue/i.test(kind);

  return {
    role,
    city,
    state,
    count,
    client: typeof fields.client === "string" ? fields.client : null,
    deadline: typeof fields.deadline === "string" ? fields.deadline : null,
    intent: isRescue ? "rescue" : "fill",
    raw_text: safeStringify(raw),
    // Structured input is trusted as-is, so confidence stays "high" even
    // when some fields are absent; missing_fields reports the gaps.
    confidence: "high",
    extraction_method: "structured",
    // Use the NORMALIZED state: an unrecognized state string ends up null
    // in the result and must therefore be reported as missing.
    missing_fields: missingFields(role, city, state, count),
  };
}

/**
 * Maps a state abbreviation or full state name (any case, surrounding
 * whitespace ignored) to its two-letter abbreviation.
 *
 * @returns The upper-case abbreviation, or null when unrecognized.
 */
function normalizeState(s: string | null | undefined): string | null {
  // Runtime guard: the LLM path may hand over non-string JSON values.
  if (typeof s !== "string") return null;
  const trimmed = s.trim();
  if (!trimmed) return null;

  const upper = trimmed.toUpperCase();
  if (STATE_ABBRS.has(upper)) return upper;

  const key = trimmed.toLowerCase().replace(/\s+/g, " ");
  // Own-property check: a plain index lookup would return inherited members
  // such as Object.prototype.constructor for the input "constructor".
  if (!Object.prototype.hasOwnProperty.call(STATE_NAMES_TO_ABBR, key)) return null;
  return STATE_NAMES_TO_ABBR[key] ?? null;
}

/**
 * Lists which of the four core fields are absent. role/city/state count as
 * missing when falsy; count only when null/undefined (0 is a present value).
 */
function missingFields(role: unknown, city: unknown, state: unknown, count: unknown): string[] {
  const out: string[] = [];
  if (!role) out.push("role");
  if (!city) out.push("city");
  if (!state) out.push("state");
  if (count === null || count === undefined) out.push("count");
  return out;
}

/** Count: "3 welders" or "need 5" or "fill 8 safety coordinators". */
function extractCount(text: string): number | null {
  const match = COUNT_AFTER_VERB_RE.exec(text) ?? COUNT_BEFORE_WORD_RE.exec(text);
  const digits = match?.[1];
  return digits === undefined ? null : parseInt(digits, 10);
}

/**
 * Role: the longest known role contained in the text (case-insensitive).
 * Plain substring matching already covers plurals ("welders" contains "welder").
 */
function extractRole(lowerText: string): string | null {
  for (const { role, needle } of ROLE_NEEDLES) {
    if (lowerText.includes(needle)) return role;
  }
  return null;
}

/**
 * City, state: "Nashville TN", "Nashville, TN" or "Nashville, Tennessee".
 * Preference order: "in {City}[,] XX", bare "{City}, XX", then full state
 * names (longest name first; "in"-anchored before bare for each name).
 */
function extractCityState(text: string): { city: string | null; state: string | null } {
  for (const re of [IN_CITY_ABBR_RE, BARE_CITY_ABBR_RE]) {
    const m = re.exec(text);
    const cityPart = m?.[1];
    const abbr = m?.[2];
    if (cityPart !== undefined && abbr !== undefined && STATE_ABBRS.has(abbr)) {
      return { city: cityPart.trim(), state: abbr };
    }
  }
  for (const { abbr, anchored, bare } of STATE_NAME_MATCHERS) {
    const cityPart = (anchored.exec(text) ?? bare.exec(text))?.[1];
    if (cityPart !== undefined) return { city: cityPart.trim(), state: abbr };
  }
  return { city: null, state: null };
}

/** First intent whose keyword hint matches, in INTENT_HINTS priority order. */
function detectIntent(text: string): NormalizedInput["intent"] {
  for (const hint of INTENT_HINTS) {
    if (hint.re.test(text)) return hint.intent;
  }
  return "unknown";
}

/**
 * Regex path — handles structured-ish text like "3 welders in Nashville TN".
 * Returns a partial extraction; the LLM path fills the rest if this leaves
 * too many missing fields. Never extracts client or deadline.
 */
function fromRegex(text: string): NormalizedInput {
  const count = extractCount(text);
  const role = extractRole(text.toLowerCase());
  const { city, state } = extractCityState(text);
  const missing = missingFields(role, city, state, count);

  return {
    role,
    city,
    state,
    count,
    client: null,
    deadline: null,
    intent: detectIntent(text),
    raw_text: text,
    confidence: confidenceFor(missing),
    extraction_method: "regex",
    missing_fields: missing,
  };
}

/** String field from LLM JSON; blank strings and the literal "null" count as absent. */
function llmString(value: unknown): string | null {
  if (typeof value !== "string") return null;
  const trimmed = value.trim();
  return trimmed === "" || trimmed === "null" ? null : trimmed;
}

/** Count field from LLM JSON: a finite number, or a numeric string. */
function llmCount(value: unknown): number | null {
  if (typeof value === "number") return Number.isFinite(value) ? value : null;
  if (typeof value === "string") return parseInt(value, 10) || null;
  return null;
}

/**
 * LLM path — when regex leaves too many fields missing, ask a small local
 * model to extract what's present. Uses `think:false` and a strict JSON
 * prompt so the call stays cheap.
 *
 * @param text Raw request text.
 * @param fallback Regex extraction used as a seed: any field the LLM does
 *   not provide is taken from it.
 * @returns The merged result; `fallback` unchanged if the call or the JSON
 *   parsing fails (deliberately best-effort — this path never throws).
 */
async function fromLLM(text: string, fallback: NormalizedInput): Promise<NormalizedInput> {
  const prompt = `Extract the staffing request shape from this text. Respond with ONLY a JSON object, no prose.

Text: ${text}

Required shape:
{
  "role": "string | null (one of: Welder, Forklift Operator, Warehouse Associate, Machine Operator, Loader, Material Handler, Assembler, Quality Tech, Picker, Packer, Shipping Clerk, Receiving Clerk, CNC Operator, Maintenance Tech, Electrician, Tool & Die Maker, Safety Coordinator, Logistics Coordinator, Packaging Operator, Sanitation Worker, Line Lead, or null if unclear)",
  "city": "string | null (US city name, properly capitalized)",
  "state": "string | null (two-letter abbrev like TN, IL)",
  "count": "number | null (integer number of workers needed)",
  "client": "string | null (company/client name if mentioned)",
  "deadline": "string | null (ISO date YYYY-MM-DD if a specific date is mentioned, else null)",
  "intent": "string (one of: fill, lookup, rescue, rebuild, unknown)"
}

Return the JSON object now:`;

  try {
    const raw = await generateContinuable("qwen3:latest", prompt, {
      max_tokens: 400,
      shape: "json",
      think: false,
      max_continuations: 2,
    });
    // Tolerate prose around the JSON: take the outermost {...} span.
    const match = raw.match(/\{[\s\S]*\}/);
    if (!match) return fallback;
    const parsed: unknown = JSON.parse(match[0]);
    if (!parsed || typeof parsed !== "object") return fallback;
    const obj = parsed as Record<string, unknown>;

    // Merge first, THEN derive missing_fields / confidence, so they describe
    // the values actually returned rather than the LLM-only subset.
    const role = llmString(obj.role) ?? fallback.role;
    const city = llmString(obj.city) ?? fallback.city;
    const state = normalizeState(llmString(obj.state)) ?? fallback.state;
    const count = llmCount(obj.count) ?? fallback.count;
    const client = llmString(obj.client) ?? fallback.client;
    const deadline = llmString(obj.deadline) ?? fallback.deadline;
    const llmIntent = VALID_INTENTS.find((candidate) => candidate === obj.intent) ?? "unknown";
    const missing = missingFields(role, city, state, count);

    return {
      role,
      city,
      state,
      count,
      client,
      deadline,
      intent: llmIntent !== "unknown" ? llmIntent : fallback.intent,
      raw_text: text,
      confidence: confidenceFor(missing),
      extraction_method: "llm",
      missing_fields: missing,
    };
  } catch {
    // Model unavailable or malformed JSON: keep whatever regex found.
    return fallback;
  }
}

/**
 * Top-level normalizer. Auto-detects input shape:
 *   - Object with role/city → fast structured path
 *   - String with enough regex signal → regex path
 *   - Low-signal string → LLM path with regex seed
 *
 * @param raw Caller input of any shape. Objects that don't qualify for the
 *   structured path are serialized to JSON text; null/undefined become "".
 * @returns The normalized input. Does not reject: LLM failures fall back to
 *   the regex result.
 */
export async function normalizeInput(raw: unknown): Promise<NormalizedInput> {
  const structured = fromStructured(raw);
  if (structured) return structured;

  const text = coerceToText(raw);
  const reg = fromRegex(text);
  // If regex got at least role + city, trust it.
  if (reg.role && reg.city) return reg;
  // Nothing to extract from blank input — don't burn an LLM call on it.
  if (text.trim() === "") return reg;
  // Otherwise try LLM fallback, using regex result as seed.
  return await fromLLM(text, reg);
}

/**
 * Synchronous-only variant — returns whatever regex/structured can get
 * without an LLM call. Useful when caller can't await (e.g. a prompt
 * template).
 */
export function normalizeInputSync(raw: unknown): NormalizedInput {
  return fromStructured(raw) ?? fromRegex(coerceToText(raw));
}