scripts: chicago analyzer field-name fixes + vectorize sanitizer hardening

Two small fixes surfaced during smoke testing: analyze_chicago_contracts.ts: permit field is contact_1_name not contact_1; reported_cost is integer-string. Fixed filter (was rejecting all 2853 permits) and contractor extraction (was empty). vectorize_raw_corpus.ts: sanitize() expanded to strip control chars + ALL backslashes (kills incomplete \uXXXX escapes) + UTF-16 surrogates (unpaired surrogates from emoji split by truncate boundary). Llm_team response cache had docs with all three pollution shapes. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-25 19:34:45 -05:00 · 2026-04-25 19:34:45 -05:00 · 779158a09b
commit 779158a09b
parent 6ac7f61819
2 changed files with 21 additions and 8 deletions
--- a/scripts/analyze_chicago_contracts.ts
+++ b/scripts/analyze_chicago_contracts.ts
@ -62,10 +62,11 @@ async function fetchPermits(n: number): Promise<Permit[]> {
  const proc = Bun.spawn(["mc", "cp", "-q", `${MC_ALIAS}/${RAW_BUCKET}/chicago/permits_2026-04-25.json`, local]);
  await proc.exited;
  const all: Permit[] = JSON.parse(await Bun.file(local).text());
-  // Pick high-cost permits with named contractors — most interesting for staffing analysis
+  // Pick high-cost permits with named contractors — most interesting for staffing analysis.
  // Field is `contact_1_name`, not `contact_1`. reported_cost is integer-like string.
  const meaningful = all.filter(p =>
    p.reported_cost && Number(p.reported_cost) >= 100000 &&
-    (p.contact_1 || p.contact_2)
+    (p.contact_1_name || p.contact_2_name)
  );
  log(`raw permits: ${all.length} · meaningful (cost >= $100k + has contractor): ${meaningful.length}`);
  // Sample evenly across the meaningful set
@ -79,8 +80,8 @@ async function fetchPermits(n: number): Promise<Permit[]> {
 function permitToText(p: Permit): string {
  const addr = `${p.street_number ?? ""} ${p.street_direction ?? ""} ${p.street_name ?? ""} ${p.suffix ?? ""}`.replace(/\s+/g, " ").trim();
-  const c1 = typeof p.contact_1 === "string" ? p.contact_1 : (p.contact_1?.name ?? p.contact_1?.contact_name ?? "");
+  const c1 = (p as any).contact_1_name ?? (typeof p.contact_1 === "string" ? p.contact_1 : (p.contact_1?.name ?? ""));
-  const c2 = typeof p.contact_2 === "string" ? p.contact_2 : (p.contact_2?.name ?? "");
+  const c2 = (p as any).contact_2_name ?? (typeof p.contact_2 === "string" ? p.contact_2 : (p.contact_2?.name ?? ""));
  return [
    `Chicago Building Permit ${p.permit_ ?? "?"}`,
    `Type: ${p.permit_type ?? "?"} · Status: ${p.permit_status ?? "?"}`,
@ -187,7 +188,7 @@ async function analyzeOne(p: Permit, idx: number, total: number): Promise<any> {
  const permitText = permitToText(p);
  // Build matrix query: combine type + work description + contractor name for retrieval anchoring
-  const c1 = typeof p.contact_1 === "string" ? p.contact_1 : (p.contact_1?.name ?? "");
+  const c1 = (p as any).contact_1_name ?? (typeof p.contact_1 === "string" ? p.contact_1 : (p.contact_1?.name ?? ""));
  const matrixQuery = `${p.permit_type ?? ""} ${(p.work_description ?? "").slice(0, 300)} ${c1}`;
  const matrix = await fetchMatrixHits(matrixQuery);
  const corporaSummary = Object.entries(matrix.by_corpus).map(([k, v]) => `${k.split("_v")[0]}=${v}`).join(" ");
--- a/scripts/vectorize_raw_corpus.ts
+++ b/scripts/vectorize_raw_corpus.ts
@ -46,6 +46,18 @@ function truncate(s: string, n = 4000): string {
  return s == null ? "" : (s.length > n ? s.slice(0, n) : s);
 }
 // Sanitize text before posting as JSON to the gateway.
 //
 // Removes three kinds of pollution that break Rust's serde JSON parser
 // at the gateway (Llm_team response cache had rows with truncated \u
 // escapes that 400'd the whole batch):
 //   1. ASCII control characters, except tab (\x09), LF (\x0A) and CR (\x0D),
 //      which are kept.
 //   2. Every backslash. This is deliberately blunt: it guarantees that no
 //      malformed or incomplete \uXXXX sequence from the source data survives.
 //   3. Unpaired UTF-16 surrogates, e.g. the half of an emoji left behind when
 //      truncate() slices through a surrogate pair. Well-formed pairs are kept,
 //      so valid emoji / astral characters pass through untouched.
 //
 // Returns "" for empty, null or undefined input.
 function sanitize(s: string): string {
  if (!s) return "";
  return s
    .replace(/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/g, "") // strip control chars (keeps \t, \n, \r)
    .replace(/\\/g, "") // strip ALL backslashes — kills any malformed \uXXXX in source data
    // The `u` flag makes the regex iterate by code point, so a valid
    // high+low pair is seen as one astral code point and is NOT matched;
    // only lone surrogates fall inside the class. This runs last so that a
    // pair re-joined by the removals above is preserved as well-formed text.
    .replace(/[\uD800-\uDFFF]/gu, "");
 }
 // ─── EXTRACTORS — one per source ───
 // Each shapes raw rows into {id, text} for the gateway's chunker.
@ -129,13 +141,13 @@ function extractLlmTeamResponseCache(raw: string): Doc[] {
      const text = [
        `Cached response ${r.cache_key ?? r.id ?? i}`,
        `Created: ${r.created_at ?? "?"}`,
-        r.prompt ? `Prompt: ${truncate(r.prompt, 1500)}` : "",
+        r.prompt ? `Prompt: ${sanitize(truncate(r.prompt, 1500))}` : "",
-        r.response ? `Response: ${truncate(r.response, 2500)}` : "",
+        r.response ? `Response: ${sanitize(truncate(r.response, 2500))}` : "",
        r.model ? `Model: ${r.model}` : "",
      ].filter(Boolean).join("\n");
      return { id: `resp_${r.cache_key ?? r.id ?? i}`, text };
    } catch {
-      return { id: `resp_${i}`, text: line.slice(0, 2000) };
+      return { id: `resp_${i}`, text: sanitize(line.slice(0, 2000)) };
    }
  });
 }