diff --git a/scripts/analyze_chicago_contracts.ts b/scripts/analyze_chicago_contracts.ts index 8438bc4..6482629 100644 --- a/scripts/analyze_chicago_contracts.ts +++ b/scripts/analyze_chicago_contracts.ts @@ -62,10 +62,11 @@ async function fetchPermits(n: number): Promise { const proc = Bun.spawn(["mc", "cp", "-q", `${MC_ALIAS}/${RAW_BUCKET}/chicago/permits_2026-04-25.json`, local]); await proc.exited; const all: Permit[] = JSON.parse(await Bun.file(local).text()); - // Pick high-cost permits with named contractors — most interesting for staffing analysis + // Pick high-cost permits with named contractors — most interesting for staffing analysis. + // Field is `contact_1_name`, not `contact_1`. reported_cost is integer-like string. const meaningful = all.filter(p => p.reported_cost && Number(p.reported_cost) >= 100000 && - (p.contact_1 || p.contact_2) + (p.contact_1_name || p.contact_2_name) ); log(`raw permits: ${all.length} · meaningful (cost >= $100k + has contractor): ${meaningful.length}`); // Sample evenly across the meaningful set @@ -79,8 +80,8 @@ async function fetchPermits(n: number): Promise { function permitToText(p: Permit): string { const addr = `${p.street_number ?? ""} ${p.street_direction ?? ""} ${p.street_name ?? ""} ${p.suffix ?? ""}`.replace(/\s+/g, " ").trim(); - const c1 = typeof p.contact_1 === "string" ? p.contact_1 : (p.contact_1?.name ?? p.contact_1?.contact_name ?? ""); - const c2 = typeof p.contact_2 === "string" ? p.contact_2 : (p.contact_2?.name ?? ""); + const c1 = (p as any).contact_1_name ?? (typeof p.contact_1 === "string" ? p.contact_1 : (p.contact_1?.name ?? "")); + const c2 = (p as any).contact_2_name ?? (typeof p.contact_2 === "string" ? p.contact_2 : (p.contact_2?.name ?? "")); return [ `Chicago Building Permit ${p.permit_ ?? "?"}`, `Type: ${p.permit_type ?? "?"} · Status: ${p.permit_status ?? 
"?"}`, @@ -187,7 +188,7 @@ async function analyzeOne(p: Permit, idx: number, total: number): Promise { const permitText = permitToText(p); // Build matrix query: combine type + work description + contractor name for retrieval anchoring - const c1 = typeof p.contact_1 === "string" ? p.contact_1 : (p.contact_1?.name ?? ""); + const c1 = (p as any).contact_1_name ?? (typeof p.contact_1 === "string" ? p.contact_1 : (p.contact_1?.name ?? "")); const matrixQuery = `${p.permit_type ?? ""} ${(p.work_description ?? "").slice(0, 300)} ${c1}`; const matrix = await fetchMatrixHits(matrixQuery); const corporaSummary = Object.entries(matrix.by_corpus).map(([k, v]) => `${k.split("_v")[0]}=${v}`).join(" "); diff --git a/scripts/vectorize_raw_corpus.ts b/scripts/vectorize_raw_corpus.ts index 54641d5..14adac8 100644 --- a/scripts/vectorize_raw_corpus.ts +++ b/scripts/vectorize_raw_corpus.ts @@ -46,6 +46,18 @@ function truncate(s: string, n = 4000): string { return s == null ? "" : (s.length > n ? s.slice(0, n) : s); } +// Sanitize text before posting as JSON. Strips control chars, every +// backslash (so no incomplete \uXXXX escape survives to break Rust's +// serde JSON parser at the gateway) and lone UTF-16 surrogates. Llm_team response cache had +// rows with truncated \u escapes that 400'd the whole batch. +function sanitize(s: string): string { + if (!s) return ""; + return s + .replace(/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/g, "") // strip control chars (tab, LF, CR are kept) + .replace(/\\/g, "") // strip ALL backslashes — kills any malformed \uXXXX in source data + .replace(/[\uD800-\uDFFF]/gu, ""); // strip only LONE surrogates (emoji split by truncate); the u flag matches by code point, so valid pairs survive +} + // ─── EXTRACTORS — one per source ─── // Each shapes raw rows into {id, text} for the gateway's chunker. @@ -129,13 +141,13 @@ function extractLlmTeamResponseCache(raw: string): Doc[] { const text = [ `Cached response ${r.cache_key ?? r.id ?? i}`, `Created: ${r.created_at ?? "?"}`, - r.prompt ? `Prompt: ${truncate(r.prompt, 1500)}` : "", - r.response ? 
`Response: ${truncate(r.response, 2500)}` : "", + r.prompt ? `Prompt: ${sanitize(truncate(r.prompt, 1500))}` : "", + r.response ? `Response: ${sanitize(truncate(r.response, 2500))}` : "", r.model ? `Model: ${r.model}` : "", ].filter(Boolean).join("\n"); return { id: `resp_${r.cache_key ?? r.id ?? i}`, text }; } catch { - return { id: `resp_${i}`, text: line.slice(0, 2000) }; + return { id: `resp_${i}`, text: sanitize(line.slice(0, 2000)) }; } }); }