Compare commits
No commits in common. "main" and "ops/auditor-systemd-units" have entirely different histories.
main
...
ops/audito
@ -1,42 +0,0 @@
|
||||
# Real Archon workflow on the Lakehouse repo, fully via our gateway.
|
||||
# Three Pi nodes, each fires LLM → /v1/chat/completions → OpenRouter,
|
||||
# every call lands a Langfuse trace + observer event.
|
||||
#
|
||||
# Read-only (allowed_tools: [read]). Don't pass --branch / leave
|
||||
# --no-worktree at runtime so Archon doesn't try to create a worktree.
|
||||
name: lakehouse-architect-review
|
||||
description: 'Pi reviews Lakehouse architecture in 3 turns through our gateway.'
|
||||
provider: pi
|
||||
model: openrouter/x-ai/grok-4.1-fast
|
||||
|
||||
nodes:
|
||||
- id: shape
|
||||
prompt: |
|
||||
Read these files and answer in 3 short bullets describing the
|
||||
architectural shape of Lakehouse:
|
||||
- /home/profit/lakehouse/Cargo.toml
|
||||
- /home/profit/lakehouse/lakehouse.toml
|
||||
- /home/profit/lakehouse/docs/MODE_RUNNER_TUNING_PLAN.md
|
||||
Be terse. No preamble.
|
||||
allowed_tools: ["read"]
|
||||
effort: low
|
||||
idle_timeout: 90000
|
||||
|
||||
- id: weakness
|
||||
prompt: |
|
||||
Read /home/profit/lakehouse/crates/gateway/src/v1/mod.rs and
|
||||
identify ONE real weakness or risk. Cite file:line. One paragraph.
|
||||
allowed_tools: ["read"]
|
||||
effort: low
|
||||
idle_timeout: 90000
|
||||
depends_on: [shape]
|
||||
|
||||
- id: improvement
|
||||
prompt: |
|
||||
Based on the prior weakness ($weakness.output), propose ONE
|
||||
surgical improvement (≤6 lines of Rust). Show the patch as
|
||||
`old_string` and `new_string` in markdown code blocks.
|
||||
allowed_tools: []
|
||||
effort: low
|
||||
idle_timeout: 90000
|
||||
depends_on: [weakness]
|
||||
46
.gitignore
vendored
46
.gitignore
vendored
@ -4,49 +4,3 @@
|
||||
.env
|
||||
__pycache__/
|
||||
*.pyc
|
||||
|
||||
# Headshot pool — binary face JPGs are fetched by scripts/staffing/fetch_face_pool.py
|
||||
# (synthetic StyleGAN, ~580MB for 1000 faces). Manifest + fetch script are tracked.
|
||||
data/headshots/face_*.jpg
|
||||
data/headshots/_thumbs/
|
||||
# ComfyUI on-demand generated portraits (per-worker unique). Cached on first
|
||||
# request; fully regeneratable via /headshots/generate/:key.
|
||||
data/headshots_gen/
|
||||
|
||||
# Runtime data — all regeneratable from inputs or accumulated by daemons.
|
||||
# Anything under data/_<name>/ is internal state (auditor outputs, KB caches,
|
||||
# pathway memory snapshots, HNSW trial results, etc.). Anything under
|
||||
# data/datasets/ or data/vectors/ is generated by ingest/index pipelines.
|
||||
data/_*/
|
||||
data/lance/
|
||||
data/datasets/
|
||||
data/vectors/
|
||||
data/demo/
|
||||
data/evidence/
|
||||
data/face_test/
|
||||
data/headshots_role_pool/
|
||||
data/icons_pool/
|
||||
data/scored-runs/
|
||||
data/workspaces/
|
||||
data/catalog/
|
||||
data/**/*.bak-*
|
||||
data/**/*.pre-*-bak
|
||||
|
||||
# Logs
|
||||
logs/
|
||||
|
||||
# Build artifacts
|
||||
node_modules/
|
||||
exports/
|
||||
mcp-server/data/
|
||||
|
||||
# Per-run distillation reports (timestamp-named); keep the parent dir tracked
|
||||
# via .gitkeep if needed but don't carry every batch's report set.
|
||||
reports/distillation/[0-9]*/
|
||||
reports/distillation/*-*-*-*-*/
|
||||
|
||||
# Test scratch — scratchpads, traces, sessions are regenerated each run.
|
||||
# PRD/scenario fixtures stay tracked (they ARE the test).
|
||||
tests/agent_test/_*
|
||||
tests/agent_test/sessions/
|
||||
tests/real-world/runs/
|
||||
|
||||
@ -7,14 +7,6 @@
|
||||
"LAKEHOUSE_URL": "http://localhost:3100",
|
||||
"MCP_TRANSPORT": "stdio"
|
||||
}
|
||||
},
|
||||
"gitea": {
|
||||
"command": "bunx",
|
||||
"args": ["gitea-mcp"],
|
||||
"env": {
|
||||
"GITEA_HOST": "https://git.agentview.dev",
|
||||
"GITEA_ACCESS_TOKEN": "SET_ME_FROM_GITEA_UI_USER_SETTINGS_APPLICATIONS"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
32
Cargo.lock
generated
32
Cargo.lock
generated
@ -46,9 +46,7 @@ dependencies = [
|
||||
name = "aibridge"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"axum",
|
||||
"lru",
|
||||
"reqwest",
|
||||
"serde",
|
||||
"serde_json",
|
||||
@ -4087,14 +4085,11 @@ dependencies = [
|
||||
"shared",
|
||||
"storaged",
|
||||
"tokio",
|
||||
"toml",
|
||||
"tonic",
|
||||
"tower-http",
|
||||
"tracing",
|
||||
"tracing-opentelemetry",
|
||||
"tracing-subscriber",
|
||||
"truth",
|
||||
"validator",
|
||||
"vectord",
|
||||
]
|
||||
|
||||
@ -4683,7 +4678,6 @@ dependencies = [
|
||||
"chrono",
|
||||
"croner",
|
||||
"csv",
|
||||
"journald",
|
||||
"lopdf",
|
||||
"mysql_async",
|
||||
"object_store",
|
||||
@ -6901,7 +6895,6 @@ dependencies = [
|
||||
"storaged",
|
||||
"tokio",
|
||||
"tracing",
|
||||
"truth",
|
||||
"url",
|
||||
]
|
||||
|
||||
@ -8733,17 +8726,6 @@ dependencies = [
|
||||
"wasm-bindgen",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "truth"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"serde",
|
||||
"serde_json",
|
||||
"tokio",
|
||||
"toml",
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "try-lock"
|
||||
version = "0.2.5"
|
||||
@ -8910,19 +8892,6 @@ dependencies = [
|
||||
"wasm-bindgen",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "validator"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"arrow 55.2.0",
|
||||
"parquet 55.2.0",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"thiserror 2.0.18",
|
||||
"tokio",
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "valuable"
|
||||
version = "0.1.1"
|
||||
@ -8950,7 +8919,6 @@ dependencies = [
|
||||
"object_store",
|
||||
"parquet 55.2.0",
|
||||
"queryd",
|
||||
"reqwest",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"sha2",
|
||||
|
||||
@ -14,8 +14,6 @@ members = [
|
||||
"crates/ui",
|
||||
"crates/lance-bench",
|
||||
"crates/vectord-lance",
|
||||
"crates/truth",
|
||||
"crates/validator",
|
||||
]
|
||||
|
||||
[workspace.dependencies]
|
||||
|
||||
269
STATE_OF_PLAY.md
269
STATE_OF_PLAY.md
@ -1,269 +0,0 @@
|
||||
# STATE OF PLAY — Lakehouse
|
||||
|
||||
**Last verified:** 2026-05-02 evening CDT
|
||||
**Verified by:** live probe (smoke 9/9, parity 32/32, gateway restarted), not memory.
|
||||
|
||||
> **Read this FIRST.** When the user says "we're working on lakehouse," they mean the working code captured below — NOT what `git log` framed as "the cutover" or what memory snapshots from 2 days ago suggest. If memory contradicts this file, this file wins. Update it when something is verified working — not when a phase finishes.
|
||||
|
||||
---
|
||||
|
||||
## WHAT LANDED 2026-05-01 → 2026-05-02 (10 commits this wave)
|
||||
|
||||
| Commit | What | Verified |
|
||||
|---|---|---|
|
||||
| `5d30b3d` | lance: auto-build doc_id btree in `lance_migrate` handler | doc-fetch ~5ms (was ~100ms full scan) on scale_test_10m |
|
||||
| `044650a` | lance-bench: same scalar build post-IVF (matches gateway) | cargo check clean |
|
||||
| `7594725` | lance: 4-pack — `sanitize_lance_err` + 7 unit tests + 9-probe smoke + 10M re-bench | smoke 9/9 PASS, tests 7/7 PASS |
|
||||
| `98b6647` | gateway: `IterateResponse.trace_id` echoed; session_log_path enabled | parity probes see one unified JSONL |
|
||||
| `57bde63` | gateway: trace-id propagation + coordinator session JSONL (Rust parity with Go wave) | session_log_parity 4/4 |
|
||||
| `ba928b1` | aibridge: drop Python sidecar from hot path; AiClient → direct Ollama | aibridge tests 32/32 PASS, /ai/embed live 768d |
|
||||
| `654797a` | gateway: pub `extract_json` + `parity_extract_json` bin | extract_json_parity 12/12 |
|
||||
| `c5654d4` | docs: pointer to `golangLAKEHOUSE/docs/ARCHITECTURE_COMPARISON.md` | — |
|
||||
| `150cc3b` | aibridge: LRU embed cache, 236× RPS warm (78ms → 129us p50) | load test |
|
||||
| `9eed982` | mcp-server: /_go/* pass-through for G5 cutover slice | — |
|
||||
| `6e34ef7` | gitignore: stop tracking 100+ runtime ephemera (data/_*, lance, logs, node_modules) | untracked dropped 100+ → 0 |
|
||||
| `41b0a99` | chore: add 33 real items that were sitting untracked (scripts, scenarios, kimi reports, dev UIs) | clean working tree |
|
||||
|
||||
**Cross-runtime parity (post-this-wave):** 32/32 across 5 probes — `validator(6/6) + extract_json(12/12) + session_log(4/4) + materializer(2/2) + embed(8/8)`. Run `cd /home/profit/golangLAKEHOUSE && for p in scripts/cutover/parity/*.sh; do bash "$p"; done` to re-verify.
|
||||
|
||||
**Lance backend (was untested 5 days ago, now gauntlet-ready):**
|
||||
- `cargo test -p vectord-lance --release` → 7/7 PASS
|
||||
- `./scripts/lance_smoke.sh` → 9/9 PASS against live gateway
|
||||
- `reports/lance_10m_rebench_2026-05-02.md` — search warm ~20ms / cold ~46ms median, doc-fetch ~5ms post-btree
|
||||
|
||||
---
|
||||
|
||||
## VERIFIED WORKING RIGHT NOW
|
||||
|
||||
### The client demo (Staffing Co-Pilot)
|
||||
|
||||
**Public URL:** `https://devop.live/lakehouse/` — 200, "Staffing Co-Pilot" (159 KB SPA, leaflet maps, dark theme).
|
||||
**Local URL:** `http://localhost:3700/` — same page, served by `mcp-server/index.ts` (PID 1271, started 09:48 CDT today).
|
||||
|
||||
**The staffers console** (the one the client was thoroughly impressed with):
|
||||
- `https://devop.live/lakehouse/console` — 200, "Lakehouse — What Your Staffing System Would Do" (26 KB)
|
||||
- Pulls project index via `/api/catalog/datasets` (36 datasets) + playbook memory via `/api/vectors/playbook_memory/stats` (4,701 entries with embeddings, real ops like *"fill: Maintenance Tech x2 in Milwaukee, WI"*)
|
||||
|
||||
Client-visible flow that works end-to-end on the public URL:
|
||||
|
||||
| Endpoint | Sample output |
|
||||
|---|---|
|
||||
| `GET /api/catalog/datasets` | 36 datasets indexed: timesheets 1M, call_log 800K, workers_500k 500K, email_log 500K, workers_100k 100K, candidates 100K, placements 50K, job_orders 15K, successful_playbooks_live 2,077 |
|
||||
| `GET /api/vectors/playbook_memory/stats` | 4,701 fill operations with embeddings |
|
||||
| `GET /system/summary` | 36 datasets, 2.98M rows, 60 indexes, 500K workers loaded, 1K candidates |
|
||||
| `POST /intelligence/staffing_forecast` | 744 Production Workers needed in 30d, 11,281 bench (4,687 reliable), coverage 1,444%, risk=ok. Same for Electrician (need 32, bench 2,440) and Maintenance Tech (need 17, bench 5,004). |
|
||||
| `POST /intelligence/permit_contracts` | permit `3442956` $500K → 3 Production Workers, 886-candidate pool, 95% fill, $36K gross. 5 more Chicago permits with 8 workers each, same pool, 95% fill, $96K each. |
|
||||
| `POST /intelligence/market` | major Chicago permits ranked: $730M O'Hare, $615M 307 N Michigan, $580M casino, $445M Loop transit (real geo coords). |
|
||||
| `POST /intelligence/permit_entities` | architects + contractors from permit contacts (e.g. "KACPRZYNSKI, ANDY", "SLS ELECTRICAL SERVICE"). |
|
||||
| `POST /intelligence/activity` + `/intelligence/arch_signals` + `/intelligence/chat` | all 200 |
|
||||
|
||||
The demo tells the story: *"upcoming Chicago contracts → workers needed → coverage from the bench → architects/contractors involved → revenue and margin."* That's the "live data + anticipating contracts + complete workflow" pitch — working as of right now.
|
||||
|
||||
### Backend, verified live this session
|
||||
|
||||
| Surface | State |
|
||||
|---|---|
|
||||
| Gateway `:3100` | up, 4 providers configured, `/v1/health` 200 with 500K workers loaded |
|
||||
| MCP server `:3700` (Co-Pilot demo) | up, all `/intelligence/*` endpoints respond |
|
||||
| VCP UI `:3950` | started this session, `/data/*` 200, real numbers |
|
||||
| Observer `:3800` | ring full (2,000/2,000) — older events evicted, query Langfuse for 24h-ago state |
|
||||
| Sidecar `:3200` | up |
|
||||
| Langfuse `:3001` | recording, `gw:/log` + `v1.chat:openrouter` traces visible |
|
||||
| LLM Team UI `:5000` | up, only `extract` mode registered |
|
||||
| OpenCode fleet | **40 models reachable through one `sk-*` key** (verified live `GET https://opencode.ai/zen/v1/models`) |
|
||||
|
||||
OpenCode catalog (live):
|
||||
- Claude: opus-4-7, opus-4-6, opus-4-5, opus-4-1, sonnet-4-6, sonnet-4-5, sonnet-4, haiku-4-5
|
||||
- GPT-5: 5.5-pro, 5.5, 5.4-pro, 5.4, 5.4-mini, 5.4-nano, 5.3-codex-spark, 5.3-codex, 5.2, 5.2-codex, 5.1-codex-max, 5.1-codex, 5.1-codex-mini, 5.1, 5-codex, 5-nano, 5
|
||||
- Gemini: 3.1-pro, 3-flash
|
||||
- GLM: 5.1, 5
|
||||
- Minimax: m2.7, m2.5
|
||||
- Kimi: k2.6, k2.5
|
||||
- Qwen: 3.6-plus, 3.5-plus
|
||||
- Other: BIG-PKL (was a typo-prone name in the catalog, model id starts with "big-pkl-something")
|
||||
- Free tier: minimax-m2.5-free, hy3-preview-free, ling-2.6-flash-free, trinity-large-preview-free
|
||||
|
||||
### The substrate (frozen — do not re-architect)
|
||||
|
||||
- Distillation v1.0.0 at tag `e7636f2` — **145/145 bun tests pass, 22/22 acceptance, 16/16 audit-full**
|
||||
- Output: `data/_kb/distilled_{facts,procedures,config_hints}.jsonl` + `data/vectors/distilled_{factual,procedural,config_hint}_v20260423102847.parquet`
|
||||
- Auditor cross-lineage: Kimi K2.6 ↔ Haiku 4.5 alternation, Opus auto-promote on diffs >100k chars, **per-PR cap=3 with auto-reset on new head SHA**
|
||||
- Pathway memory: 88 traces, 11/11 successful replays (probation gate crossed)
|
||||
- Mode runner: 5 native modes; `codereview_isolation` is default; composed-corpus auto-downgrade verified Apr 26 (composed lost 5/5 vs isolation, p=0.031)
|
||||
|
||||
### Matrix indexer
|
||||
|
||||
30+ live corpora including:
|
||||
- 5 versions of `workers_500k_v1..v9` (50K embedded chunks each)
|
||||
- 11 batched 2K-row shards `w500k_b3..b17`
|
||||
- `chicago_permits_v1` (3,420), `resumes_100k_v2` (100K candidates), `ethereal_workers_v1` (10K)
|
||||
- `lakehouse_arch_v1` (2,119), `lakehouse_symbols_v1` (2,470), `lakehouse_answers_v1` (1,269), `scrum_findings_v1` (1,260)
|
||||
- `kb_team_runs_v1` (12,693) + `kb_team_runs_agent` (4,407) — LLM-team play history embedded
|
||||
- `distilled_factual_v20260423102507` (8) — distillation output
|
||||
|
||||
### Code health
|
||||
|
||||
- `cargo check --workspace` → **0 warnings, 0 errors**
|
||||
- `bun test auditor + tests/distillation` → **145/145 pass**
|
||||
- `ui/server.ts` + `auditor.ts` bundle clean
|
||||
|
||||
---
|
||||
|
||||
## DO NOT RELITIGATE
|
||||
|
||||
- **PR #11 is merged into `origin/main` as `ed57eda`** — do not "still need to merge PR #11."
|
||||
- **Distillation tag `distillation-v1.0.0` at `e7636f2` is FROZEN** — do not re-architect schemas, scorer rules, audit fixtures.
|
||||
- **Kimi forensic HOLD verdict (2026-04-27) was 2/8 false + 6/8 latent** — do not re-debate, see `reports/kimi/audit-last-week-full.md`.
|
||||
- **`candidates_safe` `vertical` column bug** — fixed at catalog metadata layer in commit `c3c9c21`. Do not "discover" it again.
|
||||
- **Decisions A/B/C/D from `synthetic-data-gap-report.md`** — all four scripts shipped today (`d56f08e`, `940737d`, `c3c9c21`). Do not "ask J for approval."
|
||||
- **`workers_500k.phone` type fixup** — already string. The fixup script is idempotent; running it is a no-op.
|
||||
- **`client_workerskjkk` typo dataset** — was breaking every SQL query (catalog had it registered, file didn't exist). Removed via `DELETE /catalog/datasets/by-name/client_workerskjkk` this session. Do not re-add. Adding a startup gate that errors on unrecognized parquet names is the long-term fix per now.md Step 2C.
|
||||
- **Python sidecar dropped from hot path 2026-05-02 (`ba928b1`)** — AiClient calls Ollama directly. Do not "wire python embedding back in." `lab_ui.py` + `pipeline_lab.py` keep running as dev-only UIs (not on the runtime path).
|
||||
- **Lance backend gauntlet (2026-05-02)** — sanitizer over all 5 routes, 7 unit tests, 9-probe smoke, 10M re-bench. The `doc_id` btree auto-builds inside `lance_migrate` AND `lance-bench`. Do not "discover" the missing scalar index again or the leaked filesystem paths in error bodies.
|
||||
- **Cross-runtime parity = 32/32** across 5 probes in `golangLAKEHOUSE/scripts/cutover/parity/`. Do not "build a parity probe for X" without checking — validator, extract_json, session_log, materializer, and embed are all already covered.
|
||||
- **Decisions tracker is `golangLAKEHOUSE/docs/ARCHITECTURE_COMPARISON.md`** — single living source of truth for cross-runtime decisions. As of 2026-05-02 it has 0 `_open_` code work items; only 2 strategic items left (Lance vs Parquet+HNSW-with-spilling, Go-vs-Rust primary cutover).
|
||||
|
||||
---
|
||||
|
||||
## FIXES MADE THIS SESSION (2026-04-27 evening)
|
||||
|
||||
1. **`crates/gateway/src/v1/iterate.rs:93`** — `state` → `_state` (cleared the one cargo warning).
|
||||
2. **`lakehouse-ui.service` (Dioxus)** — disabled. Was failing 7,242 times against a missing `target/dx/ui/debug/web/public` build dir. `systemctl stop && disable`.
|
||||
3. **VCP UI on `:3950`** — started `bun run ui/server.ts` (PID 1162212, log `/tmp/lakehouse_ui.log`). `/data/*` endpoints now 200 with real data.
|
||||
4. **`client_workerskjkk` catalog entry** — `DELETE /catalog/datasets/by-name/client_workerskjkk` removed the dead manifest. **This was the actual root cause** of `/system/summary` reporting `workers_500k_rows: 0` and the demo showing zero bench. Every SQL query was failing schema inference on the missing file before reaching its target table. Fixed → `workers_500k_rows: 500000`, `candidates_rows: 1000`, demo coverage flipped from "critical 0%" to actual percentages on devop.live/lakehouse.
|
||||
|
||||
## FIXES MADE THIS SESSION (2026-04-28 early — face pool)
|
||||
|
||||
5. **Synthetic StyleGAN face pool — 1000 faces, gender+race+age tagged.** `scripts/staffing/fetch_face_pool.py` fetches from thispersondoesnotexist.com; `scripts/staffing/tag_face_pool.py --min-age 22` runs deepface and excludes minors. `data/headshots/manifest.jsonl` now has gender (494 men / 458 women), race (caucasian 662 · east_asian 128 · hispanic 86 · middle_eastern 59 · black 14 · south_asian 3), age, and 48 minor exclusions. Server pool = 952 servable faces.
|
||||
6. **`mcp-server/index.ts:1308` `/headshots/:key` route** — gender×race×age intersection bucketing with graceful fallback (gender-only → all). Same key always returns same face; different keys spread evenly.
|
||||
7. **`/headshots/_thumbs/` pre-resized 384×384 webp** (60× smaller: 587KB → ~11KB). Without this, 40-card grids overran Chrome's parallel-connection budget and ~75% of tiles never finished decoding. Generated via parallel ffmpeg (`xargs -P 8`); `.gitignore`d.
|
||||
8. **`mcp-server/search.html` + `console.html`** — dropped `img.loading='lazy'`. With 11KB thumbs, eager load is cheap (~500KB for 50 cards) and avoids the off-screen race that lazy decode produced.
|
||||
9. **ComfyUI on-demand uniqueness — `serve_imagegen.py:32`** added `seed` to `_cache_key()` (was caching by prompt only — 3 different worker seeds collapsed to 1 cached image). Verified: seed=839185194/195/196 → 3 distinct md5s.
|
||||
10. **`mcp-server/index.ts:1234` `/headshots/generate/:key`** — ComfyUI hot-path that derives a deterministic-per-worker seed via djb2-style hash; cold ~1.5s, cached ~1ms. Worker prompt format: `professional corporate headshot portrait of a {age}-year-old {race} {gender}, {role}, neutral expression, plain studio background, soft natural lighting, sharp focus, photorealistic, dslr`. Cache at `data/headshots_gen/` (gitignored, regeneratable).
|
||||
11. **Confidence-default name resolution** in `search.html` — `genderFor()` and `guessEthnicityFromFirstName()` lookup tables (FEMALE_NAMES, MALE_NAMES, NAMES_HISPANIC, NAMES_BLACK, NAMES_SOUTH_ASIAN, NAMES_EAST_ASIAN, NAMES_MIDDLE_EASTERN). Xavier → man+hispanic, Aisha → woman+black, etc. Every worker resolves to a face-pool bucket.
|
||||
|
||||
End-to-end verified: playwright run on `https://devop.live/lakehouse/?q=forklift+operators+IL` → 21/21 cards loaded, 0 broken, all 384×384 webp thumbs.
|
||||
|
||||
---
|
||||
|
||||
## OPEN — but not blocking the demo
|
||||
|
||||
| Item | What | When to act |
|
||||
|---|---|---|
|
||||
| `modes.toml` `staffing_inference.matrix_corpus` | still says `workers_500k_v8`. v9 in vector index is from Apr 17 (raw-sourced, not safe-view). The new `build_workers_v9.sh` rebuilds from `workers_safe`. | Run when you have 30+ min for the rebuild. |
|
||||
| Open PRs #6, #7, #10 | sitting since Apr 22-24, auditor verdicts on disk at `data/_auditor/kimi_verdicts/{6,7,10}-*.json` | Read verdicts, decide reconcile/close. |
|
||||
| `test/enrich-prd-pipeline` branch | 35 unmerged commits, includes more-evolved auditor/inference.ts (666 vs main's 580 lines), curation+fact-extractor wiring | Reconcile or formally archive — see `memory/project_unmerged_architecture_work.md`. |
|
||||
| `federation-hnsw-trials` stash | Lance + S3/MinIO prototype, `aws-config` crate added, 708 insertions | Phase B from EXECUTION_PLAN.md — revisit when Parquet vector ceiling actually hurts. |
|
||||
| `candidates` manifest drift | manifest 100K vs SQL 1K. Cosmetic. | Run a metadata resync if it matters. |
|
||||
|
||||
---
|
||||
|
||||
## RUNTIME CHEATSHEET
|
||||
|
||||
```bash
|
||||
# Verify the demo (public + local both work)
|
||||
curl -sS https://devop.live/lakehouse/ # Co-Pilot HTML
|
||||
curl -sS https://devop.live/lakehouse/console # staffers console
|
||||
curl -sS -X POST https://devop.live/lakehouse/intelligence/staffing_forecast \
|
||||
-d '{}' -H 'content-type: application/json' \
|
||||
| jq '.forecast[] | {role, demand_workers, bench_total, coverage_pct, risk}'
|
||||
|
||||
# Restart sequence (after Rust changes)
|
||||
sudo systemctl restart lakehouse.service # gateway :3100
|
||||
sudo systemctl restart lakehouse-auditor # auditor daemon
|
||||
sudo systemctl restart lakehouse-observer # observer :3800
|
||||
# UI bun on :3950 is NOT systemd-managed (lakehouse-ui.service is disabled).
|
||||
# Restart manually: kill <pid>; nohup bun run ui/server.ts > /tmp/lakehouse_ui.log 2>&1 &
|
||||
|
||||
# Health checks
|
||||
curl -sS http://localhost:3100/v1/health | jq # workers_count, providers
|
||||
curl -sS http://localhost:3100/vectors/pathway/stats | jq
|
||||
curl -sS http://localhost:3100/v1/usage | jq # since-restart cost
|
||||
curl -sS http://localhost:3700/system/summary | jq # dataset counts
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## VISION — what we're actually building (not what's done)
|
||||
|
||||
J's framing for the legacy staffing company:
|
||||
|
||||
- Pull live data, anticipate contracts based on Chicago permits → real architect/contractor associations, headcount, time period, money, scope.
|
||||
- Hybrid + memory index → search large corpora cheaply.
|
||||
- Email comes in → verify against contract; SMS comes in → alert when index changes.
|
||||
- Real-time.
|
||||
- Invent metrics nobody else has using the hybrid index.
|
||||
- Next stage: workers download an app → geolocation clock-in → automatic responsiveness measurement, no user effort, with incentives for using it.
|
||||
- Find people getting certificates (passive cert tracking).
|
||||
- Pull union data → bring contracts that work for **employees**, not just employers.
|
||||
- All metrics visible, nothing hidden, value-aligned with what each side actually needs.
|
||||
|
||||
If a future session is shaving away from this vision toward "fix the cutover" or "land Phase X," the vision wins. Phases are scaffolding for the vision, not the goal.
|
||||
|
||||
---
|
||||
|
||||
## CURRENT PLAN — fix the demo for the legacy staffing client
|
||||
|
||||
Built from playwright audit of the live demo (2026-04-27 evening). Each item ends in something the client can SEE, not internal cleanups.
|
||||
|
||||
**Demo state is anchored by git tag `demo-2026-04-27`** (commit `ed57eda`, the merge of PR #11). To restore code state: `git checkout demo-2026-04-27`. To restore runtime state: `DELETE /catalog/datasets/by-name/client_workerskjkk` (catalog hot-fix is not in git).
|
||||
|
||||
### P1 — Search box that actually filters (highest visible impact)
|
||||
|
||||
**Problem:** typing in `#sq` and pressing Enter fires `POST /intelligence/chat` with body `{"message":"<query>"}`. The state (`#sst`) and role (`#srl`) selects are ignored — never sent in the body. So every search returns a generic chat completion, never a SQL+vector hybrid filter against `workers_500k`. That is the "cached/generic response" the client sees.
|
||||
|
||||
**Fix:** in `mcp-server/search.html`, change the search-submit handler to call the real worker search endpoint with `{query, state, role, top_k}`. The MCP `search_workers` tool surface already exists; route the form there. Render returned worker rows in the existing card grid.
|
||||
|
||||
**Done when:** typing "forklift" + state IL + role "Forklift Operator" returns ≤ top_k IL Forklift Operators, and changing state to WI returns different workers.
|
||||
|
||||
### P2 — Contractor-name click → `/contractor` profile page
|
||||
|
||||
**Problem:** clicking a contractor name in any rendered card stays on `/lakehouse/`. URL doesn't change.
|
||||
|
||||
**Fix:** wrap contractor names in `<a href="/contractor?name=<encoded>">`. The page `mcp-server/contractor.html` (14.8 KB, "Contractor Profile · Staffing Co-Pilot") already exists at `/contractor` and the data endpoint `/intelligence/contractor_profile` already returns rich data.
|
||||
|
||||
**Then check contractor.html actually shows:** full history of every record the database has on that contractor + heat map of locations underneath + relevant info (per J 2026-04-27). If the page is incomplete, finish it. Otherwise just wire the link.
|
||||
|
||||
**Done when:** clicking "KACPRZYNSKI, ANDY" opens a profile with: every Chicago permit they're contact_1 or contact_2 on, a leaflet map with markers for each address, and any matched workers from prior placements at their sites.
|
||||
|
||||
### P3 — Substrate signal at the bottom shows the right numbers
|
||||
|
||||
**Problem:** J reports the bottom panel says "playbook memory empty, 80 traces 0 replies." Reality from the live endpoints: `/api/vectors/playbook_memory/stats` = 4,701 entries with embeddings; `/vectors/pathway/stats` = 88 traces, 11/11 replays.
|
||||
|
||||
**Fix:** find the renderer in search.html that builds the substrate signal panel; verify it's hitting the right endpoints and reading the right keys; fix shape mismatches.
|
||||
|
||||
**Done when:** bottom panel shows real numbers (4,701 playbooks, 88 traces, 11/11 replays) and references at least one specific recent operation from the playbook stats sample.
|
||||
|
||||
### P4 — Top nav reflects today's architecture
|
||||
|
||||
**Problem:** Walkthrough/Architecture/Spec/Onboard/Alerts/Workspaces tabs all return 200 but content is from old architecture. Doesn't mention: gateway scratchpad, memory indexer, ranker, mode runner, OpenCode 40-model fleet, distillation substrate, auditor cross-lineage.
|
||||
|
||||
**Fix:** rewrite `mcp-server/proof.html` (or add a single new page "What's running" that replaces Architecture+Spec) to describe what's actually shipped as of `demo-2026-04-27`. Keep one architecture page, drop redundancy. Either complete or hide Onboard/Alerts/Workspaces — J's call which.
|
||||
|
||||
**Done when:** the architecture page tells a non-technical reader, in 2 minutes, what each piece does in coordinator-relatable terms ("intern that read every email", not "3-stage adversarial inference pipeline").
|
||||
|
||||
### P5 — Caching for the project-index build_signal (J flagged unfinished)
|
||||
|
||||
**Problem:** "we never finished our caching for project index build signal it's not pulling new information." Need to find what `build_signal` refers to. Likely a scrum/auditor signal that should rebuild the `lakehouse_arch_v1` corpus on commit but isn't wired to.
|
||||
|
||||
**Fix:** identify the build-signal pipeline (likely in `auditor/` or `crates/vectord/`), wire its emit to a corpus rebuild, verify by making a test commit and watching the new chunk appear in `/vectors/indexes` for `lakehouse_arch_v1`.
|
||||
|
||||
**Done when:** committing a new file to `crates/` causes `lakehouse_arch_v1` chunk_count to increase within N minutes.
|
||||
|
||||
### P0 — Anchor the demo state (DONE)
|
||||
|
||||
Tagged `ed57eda` as `demo-2026-04-27`. Future sessions: `git checkout demo-2026-04-27` to land in this exact code state.
|
||||
|
||||
---
|
||||
|
||||
## EXECUTION ORDER
|
||||
|
||||
1. **P1 first** — biggest visible bug, ~30-60 min
|
||||
2. **P2 next** — contractor click is the second-biggest "doesn't work" the client sees, ~20 min if profile is mostly done
|
||||
3. **P3** — small fix, big "looks alive" win
|
||||
4. **P4** — biggest scope; might split across sessions
|
||||
5. **P5** — feature work, only after the visible bugs are fixed
|
||||
|
||||
Each item commits independently with the format `demo: P<n> — <one-line>` so the commit log doubles as a progress journal. After each merge to main, re-tag `demo-latest` to point at the new HEAD.
|
||||
|
||||
Stop here and let J pick which item to start with. Do not silently extend scope.
|
||||
@ -33,13 +33,6 @@ Defaults: polls every 90s, stops on `auditor.paused` file present.
|
||||
|
||||
- `data/_auditor/state.json` — last-audited head SHA per PR
|
||||
- `data/_auditor/verdicts/{pr}-{sha}.json` — per-run verdict record
|
||||
- `data/_kb/audit_lessons.jsonl` — one row per block/warn finding,
|
||||
path-agnostic signature for dedup. Tailed by kb_query on each audit
|
||||
to surface recurring patterns (2+ distinct PRs with same signature
|
||||
→ info, 3-4 → warn, 5+ → block). This is how the auditor learns.
|
||||
- `data/_kb/scrum_reviews.jsonl` — scrum-master per-file reviews. If
|
||||
a file in the current PR has been scrum-reviewed, kb_query surfaces
|
||||
the review as a finding with the accepted model and attempt count.
|
||||
|
||||
## Where YOU edit
|
||||
|
||||
|
||||
@ -12,8 +12,7 @@
|
||||
// review — reviews have self-review restrictions on Gitea and the
|
||||
// auditor currently uses the same PAT as the PR author).
|
||||
|
||||
import { readFile, writeFile, mkdir, appendFile } from "node:fs/promises";
|
||||
import { createHash } from "node:crypto";
|
||||
import { readFile, writeFile, mkdir } from "node:fs/promises";
|
||||
import { join } from "node:path";
|
||||
import type { PrSnapshot, Verdict, Finding } from "./types.ts";
|
||||
import { getPrDiff, postCommitStatus, postIssueComment } from "./gitea.ts";
|
||||
@ -23,13 +22,8 @@ import { runStaticCheck } from "./checks/static.ts";
|
||||
import { runDynamicCheck } from "./checks/dynamic.ts";
|
||||
import { runInferenceCheck } from "./checks/inference.ts";
|
||||
import { runKbCheck } from "./checks/kb_query.ts";
|
||||
import { runKimiArchitectCheck } from "./checks/kimi_architect.ts";
|
||||
|
||||
const VERDICTS_DIR = "/home/profit/lakehouse/data/_auditor/verdicts";
|
||||
// Playbook for audit findings — one row per block/warn finding from a
|
||||
// verdict. kb_query tails this next audit and escalates recurrences.
|
||||
// Structured as JSONL so it's cheap to append and cheap to tail.
|
||||
const AUDIT_LESSONS_JSONL = "/home/profit/lakehouse/data/_kb/audit_lessons.jsonl";
|
||||
|
||||
export interface AuditOptions {
|
||||
// Skip the cloud inference call (fast path for iteration). Default false.
|
||||
@ -57,8 +51,8 @@ export async function auditPr(pr: PrSnapshot, opts: AuditOptions = {}): Promise<
|
||||
const [staticFindings, dynamicFindings, inferenceFindings, kbFindings] = await Promise.all([
|
||||
runStaticCheck(diff),
|
||||
opts.skip_dynamic ? Promise.resolve(stubFinding("dynamic", "skipped by options")) : runDynamicCheck(),
|
||||
opts.skip_inference ? Promise.resolve(stubFinding("inference", "skipped by options")) : runInferenceCheck(claims, diff, { pr_number: pr.number, head_sha: pr.head_sha }),
|
||||
runKbCheck(claims, pr.files.map(f => f.path)),
|
||||
opts.skip_inference ? Promise.resolve(stubFinding("inference", "skipped by options")) : runInferenceCheck(claims, diff),
|
||||
runKbCheck(claims),
|
||||
]);
|
||||
|
||||
const allFindings: Finding[] = [
|
||||
@ -68,29 +62,6 @@ export async function auditPr(pr: PrSnapshot, opts: AuditOptions = {}): Promise<
|
||||
...kbFindings,
|
||||
];
|
||||
|
||||
// Kimi-architect second-pass review. Off by default; enabled with
|
||||
// LH_AUDITOR_KIMI=1. Sequential (not in the parallel block above)
|
||||
// because it consumes the prior findings as context — Kimi sees what
|
||||
// deepseek already flagged and is asked "what did everyone miss?"
|
||||
// Failure-isolated by design: any error returns a single info-level
|
||||
// skip finding so the existing audit pipeline never blocks on Kimi.
|
||||
if (process.env.LH_AUDITOR_KIMI === "1") {
|
||||
try {
|
||||
const kimiFindings = await runKimiArchitectCheck(diff, allFindings, {
|
||||
pr_number: pr.number,
|
||||
head_sha: pr.head_sha,
|
||||
});
|
||||
allFindings.push(...kimiFindings);
|
||||
} catch (e) {
|
||||
allFindings.push({
|
||||
check: "kimi_architect",
|
||||
severity: "info",
|
||||
summary: `kimi_architect outer error — ${(e as Error).message.slice(0, 160)}`,
|
||||
evidence: [(e as Error).stack?.slice(0, 360) ?? ""],
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
const duration_ms = Date.now() - t0;
|
||||
const metrics = {
|
||||
audit_duration_ms: duration_ms,
|
||||
@ -101,7 +72,6 @@ export async function auditPr(pr: PrSnapshot, opts: AuditOptions = {}): Promise<
|
||||
claims_strong: claims.filter(c => c.strength === "strong").length,
|
||||
claims_moderate: claims.filter(c => c.strength === "moderate").length,
|
||||
claims_weak: claims.filter(c => c.strength === "weak").length,
|
||||
claims_empirical: claims.filter(c => c.strength === "empirical").length,
|
||||
claims_total: claims.length,
|
||||
diff_bytes: diff.length,
|
||||
};
|
||||
@ -110,15 +80,6 @@ export async function auditPr(pr: PrSnapshot, opts: AuditOptions = {}): Promise<
|
||||
|
||||
await persistVerdict(verdict);
|
||||
|
||||
// Feedback loop — every block/warn finding becomes a row in
|
||||
// audit_lessons.jsonl, dedup-keyed by (check, normalized-summary).
|
||||
// The next audit's kb_query reads these and escalates recurring
|
||||
// findings so we don't lose the "this pattern has been flagged
|
||||
// before" signal across runs. Fire-and-forget; failure here must
|
||||
// not break the audit.
|
||||
appendAuditLessons(verdict).catch(e =>
|
||||
console.error(`[audit] audit_lessons append failed: ${(e as Error).message}`));
|
||||
|
||||
if (!opts.dry_run) {
|
||||
await postToGitea(verdict);
|
||||
}
|
||||
@ -126,42 +87,6 @@ export async function auditPr(pr: PrSnapshot, opts: AuditOptions = {}): Promise<
|
||||
return verdict;
|
||||
}
|
||||
|
||||
// Normalizes a finding summary for dedup: strips path-specific tails
|
||||
// ("in path/to/file.ts" → "in <file>"), line numbers, and long
|
||||
// commit-hash snippets. The goal is: the SAME class of finding on
|
||||
// DIFFERENT files should share a signature, so we can measure
|
||||
// "this pattern keeps showing up."
|
||||
function normalizedSignature(f: Finding): string {
|
||||
const summary = String(f.summary)
|
||||
.replace(/\bin\s+\S+\.(ts|rs|js|py|md)\b/gi, "in <file>")
|
||||
.replace(/:\+?\d+\b/g, ":<line>")
|
||||
.replace(/[0-9a-f]{8,}/gi, "<hash>")
|
||||
.replace(/\s+/g, " ")
|
||||
.trim()
|
||||
.slice(0, 240);
|
||||
const src = `${f.check}::${f.severity}::${summary}`;
|
||||
return createHash("sha256").update(src).digest("hex").slice(0, 16);
|
||||
}
|
||||
|
||||
async function appendAuditLessons(v: Verdict): Promise<void> {
|
||||
const actionable = v.findings.filter(f => f.severity === "block" || f.severity === "warn");
|
||||
if (actionable.length === 0) return;
|
||||
await mkdir(join(AUDIT_LESSONS_JSONL, ".."), { recursive: true });
|
||||
const rows: string[] = [];
|
||||
for (const f of actionable) {
|
||||
rows.push(JSON.stringify({
|
||||
signature: normalizedSignature(f),
|
||||
check: f.check,
|
||||
severity: f.severity,
|
||||
summary: f.summary,
|
||||
pr_number: v.pr_number,
|
||||
head_sha: v.head_sha,
|
||||
audited_at: v.audited_at,
|
||||
}));
|
||||
}
|
||||
await appendFile(AUDIT_LESSONS_JSONL, rows.join("\n") + "\n");
|
||||
}
|
||||
|
||||
async function persistVerdict(v: Verdict): Promise<void> {
|
||||
await mkdir(VERDICTS_DIR, { recursive: true });
|
||||
const filename = `${v.pr_number}-${v.head_sha.slice(0, 12)}.json`;
|
||||
@ -208,7 +133,7 @@ function formatReviewBody(v: Verdict): string {
|
||||
lines.push("");
|
||||
|
||||
// Per-check sections, only if the check produced findings.
|
||||
const checkOrder = ["static", "dynamic", "inference", "kb_query", "kimi_architect"] as const;
|
||||
const checkOrder = ["static", "dynamic", "inference", "kb_query"] as const;
|
||||
for (const check of checkOrder) {
|
||||
const fs = byCheck[check] ?? [];
|
||||
if (fs.length === 0) continue;
|
||||
@ -241,6 +166,6 @@ function formatReviewBody(v: Verdict): string {
|
||||
return lines.join("\n");
|
||||
}
|
||||
|
||||
function stubFinding(check: "dynamic" | "inference" | "kimi_architect", why: string): Finding[] {
|
||||
function stubFinding(check: "dynamic" | "inference", why: string): Finding[] {
|
||||
return [{ check, severity: "info", summary: `${check} check skipped — ${why}`, evidence: [why] }];
|
||||
}
|
||||
|
||||
@ -1,68 +0,0 @@
|
||||
// One-shot dry-run audit of a single PR. Useful for verifying check
|
||||
// behavior (kb_query scrum surfacing, inference prompts, etc.) without
|
||||
// posting to Gitea. Does NOT touch state.json and does NOT post
|
||||
// commit status or PR comments.
|
||||
//
|
||||
// Run: bun run auditor/audit_one.ts <pr-number>
|
||||
|
||||
import { getPrSnapshot } from "./gitea.ts";
|
||||
import { auditPr } from "./audit.ts";
|
||||
|
||||
async function main() {
|
||||
const prNumRaw = process.argv[2];
|
||||
if (!prNumRaw) {
|
||||
console.error("usage: bun run auditor/audit_one.ts <pr-number>");
|
||||
process.exit(2);
|
||||
}
|
||||
const prNum = Number(prNumRaw);
|
||||
if (!Number.isFinite(prNum)) {
|
||||
console.error(`invalid PR number: ${prNumRaw}`);
|
||||
process.exit(2);
|
||||
}
|
||||
|
||||
console.log(`[audit_one] fetching PR #${prNum}...`);
|
||||
const pr = await getPrSnapshot(prNum);
|
||||
console.log(`[audit_one] PR #${pr.number}: "${pr.title}" (head=${pr.head_sha.slice(0, 12)})`);
|
||||
console.log(`[audit_one] files in diff: ${pr.files.length}`);
|
||||
for (const f of pr.files) console.log(` - ${f.path} (+${f.additions}/-${f.deletions})`);
|
||||
console.log("");
|
||||
|
||||
const verdict = await auditPr(pr, {
|
||||
dry_run: true, // no Gitea posting
|
||||
skip_dynamic: true, // don't run fixture
|
||||
skip_inference: process.env.LH_AUDITOR_SKIP_INFERENCE === "1",
|
||||
});
|
||||
|
||||
console.log("\n═══ VERDICT ═══");
|
||||
console.log(`overall: ${verdict.overall}`);
|
||||
console.log(`one-liner: ${verdict.one_liner}`);
|
||||
console.log(`findings: total=${verdict.metrics.findings_total} block=${verdict.metrics.findings_block} warn=${verdict.metrics.findings_warn} info=${verdict.metrics.findings_info}`);
|
||||
console.log("");
|
||||
|
||||
// Print findings, highlighting kb_query scrum surfacing
|
||||
const byCheck: Record<string, typeof verdict.findings> = {};
|
||||
for (const f of verdict.findings) (byCheck[f.check] ||= []).push(f);
|
||||
|
||||
for (const [check, findings] of Object.entries(byCheck)) {
|
||||
console.log(`── ${check} (${findings.length}) ──`);
|
||||
for (const f of findings) {
|
||||
const tag = f.severity === "block" ? "🛑" : f.severity === "warn" ? "⚠️ " : "ℹ️ ";
|
||||
console.log(` ${tag} [${f.severity}] ${f.summary}`);
|
||||
if (f.summary.includes("scrum-master")) {
|
||||
for (const e of f.evidence) {
|
||||
console.log(` → ${e.slice(0, 200)}`);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const scrumFindings = verdict.findings.filter(f => f.summary.includes("scrum-master"));
|
||||
console.log("");
|
||||
console.log(`═══ SCRUM WIRE CHECK: ${scrumFindings.length} scrum-master findings surfaced by kb_query ═══`);
|
||||
if (scrumFindings.length === 0) {
|
||||
console.log(" (none — either no matching scrum_reviews.jsonl rows, or files didn't match PR diff)");
|
||||
}
|
||||
process.exit(0);
|
||||
}
|
||||
|
||||
main().catch(e => { console.error("[audit_one] fatal:", e); process.exit(1); });
|
||||
@ -13,56 +13,17 @@
|
||||
// with a 15KB diff + claim list).
|
||||
|
||||
import type { Claim, Finding } from "../types.ts";
|
||||
import { Glob } from "bun";
|
||||
import { readFile, mkdir, appendFile } from "node:fs/promises";
|
||||
import { extractFacts } from "../fact_extractor.ts";
|
||||
|
||||
const GATEWAY = process.env.LH_GATEWAY_URL ?? "http://localhost:3100";
|
||||
// Rebuild 2026-04-26: route claim verification through /v1/mode/execute
|
||||
// (task_class=pr_audit) so we get pathway memory + lakehouse_answers_v1
|
||||
// + JSON-shaped framing molded into ONE prompt. The hand-rolled
|
||||
// systemMsg/userMsg path was reinventing the mode runner badly.
|
||||
//
|
||||
// 2026-04-27 update: original default kimi-k2:1t hit a sustained
|
||||
// upstream outage on Ollama Cloud (consistent 500 ISE across hours of
|
||||
// retries — verified with trivial 8-token probes). Swapped default to
|
||||
// deepseek-v3.1:671b which is proven working end-to-end through the
|
||||
// pr_audit mode runner during Phase 5 distillation acceptance testing.
|
||||
// kimi-k2:1t can be re-selected via LH_AUDITOR_REVIEW_MODEL env when
|
||||
// the upstream returns. Tie-breaker stays grok-4.1-fast (different
|
||||
// vendor lineage so consensus + tie-break won't fail-correlate).
|
||||
const MODEL = process.env.LH_AUDITOR_REVIEW_MODEL ?? "deepseek-v3.1:671b";
|
||||
const TIEBREAKER_MODEL = process.env.LH_AUDITOR_TIEBREAKER_MODEL ?? "x-ai/grok-4.1-fast";
|
||||
const N_CONSENSUS = Number(process.env.LH_AUDITOR_CONSENSUS_N ?? 3);
|
||||
const AUDIT_DISCREPANCIES_JSONL = "/home/profit/lakehouse/data/_kb/audit_discrepancies.jsonl";
|
||||
// 40KB comfortably fits the consensus models' context windows
|
||||
// (deepseek-v3.1 64K, gpt-oss-120b 128K). When the raw PR diff
|
||||
// exceeds this, we truncate and signal it via curationNote — the
|
||||
// pr_audit mode runner's matrix retrieval (lakehouse_answers_v1 +
|
||||
// arch + symbols) supplies the cross-PR context that tree-split
|
||||
// used to synthesize from scratch. Tree-split itself was retired
|
||||
// 2026-04-27 (see commit deleting treeSplitDiff/callCloud/SHARD_*).
|
||||
const MODEL = process.env.LH_AUDITOR_REVIEW_MODEL ?? "gpt-oss:120b";
|
||||
// 40KB comfortably fits gpt-oss:120b's context. PR #1 (~39KB) was
|
||||
// previously truncated at 15KB causing the reviewer to miss later
|
||||
// files (gitea.ts, policy.ts) and flag "no Gitea client present" as a
|
||||
// block finding when the file was simply outside the truncation window.
|
||||
const MAX_DIFF_CHARS = 40000;
|
||||
const CALL_TIMEOUT_MS = 120_000;
|
||||
// Mode runner can take longer than a raw /v1/chat call because it does
|
||||
// pathway-fingerprint lookup + matrix retrieval + relevance filter
|
||||
// before the LLM call. Budget extra time so we don't trip on a slow
|
||||
// answers-corpus search.
|
||||
const MODE_RUNNER_TIMEOUT_MS = 240_000;
|
||||
const REPO_ROOT = "/home/profit/lakehouse";
|
||||
|
||||
export interface InferenceContext {
|
||||
pr_number: number;
|
||||
head_sha: string;
|
||||
}
|
||||
|
||||
const AUDIT_FACTS_JSONL = "/home/profit/lakehouse/data/_kb/audit_facts.jsonl";
|
||||
|
||||
export async function runInferenceCheck(
|
||||
claims: Claim[],
|
||||
diff: string,
|
||||
ctx?: InferenceContext,
|
||||
): Promise<Finding[]> {
|
||||
export async function runInferenceCheck(claims: Claim[], diff: string): Promise<Finding[]> {
|
||||
if (claims.length === 0) {
|
||||
return [{
|
||||
check: "inference",
|
||||
@ -72,38 +33,9 @@ export async function runInferenceCheck(
|
||||
}];
|
||||
}
|
||||
|
||||
// Empirical claims (runtime metrics / observed outcomes) can't be
|
||||
// verified from the diff. Drop them from the cloud prompt so the
|
||||
// reviewer doesn't chase ghosts. A future `runtime_evidence` check
|
||||
// can validate these against data/_kb/*/summary.json outputs.
|
||||
const verifiable = claims.filter(c => c.strength !== "empirical");
|
||||
const empiricalCount = claims.length - verifiable.length;
|
||||
if (verifiable.length === 0) {
|
||||
return [{
|
||||
check: "inference",
|
||||
severity: "info",
|
||||
summary: `all ${claims.length} claims are empirical (runtime metrics) — skipping cloud inference`,
|
||||
evidence: [`empirical claims can't be verified from a static diff; needs runtime-evidence check`],
|
||||
}];
|
||||
}
|
||||
|
||||
// 2026-04-27 architecture simplification: dropped the tree-split
|
||||
// scratchpad layer. Rationale: the mode runner's pr_audit pipeline
|
||||
// pulls from lakehouse_answers_v1 (gold-standard prior audits) +
|
||||
// lakehouse_arch_v1 + lakehouse_symbols_v1 via matrix retrieval. That
|
||||
// corpus IS the cross-PR context the tree-split was synthesizing
|
||||
// from scratch on every audit run. With the distillation substrate
|
||||
// shipped (commits 27b1d27..1b433a9), per-shard fact extraction is
|
||||
// redundant — and gpt-oss:120b at 168 calls/audit was the dominant
|
||||
// cost. Now: truncate diff to MAX_DIFF_CHARS, hand straight to the
|
||||
// mode runner, let retrieval supply context. ONE strong-model call
|
||||
// per consensus rep × N=3 reps = 3 calls total per audit.
|
||||
const truncated = diff.length > MAX_DIFF_CHARS
|
||||
? diff.slice(0, MAX_DIFF_CHARS) + `\n...[${diff.length - MAX_DIFF_CHARS} more chars truncated — the pr_audit mode runner has matrix retrieval against lakehouse_answers_v1 + arch + symbols for cross-PR context]`
|
||||
? diff.slice(0, MAX_DIFF_CHARS) + `\n...[${diff.length - MAX_DIFF_CHARS} more chars truncated]`
|
||||
: diff;
|
||||
const curationNote = diff.length > MAX_DIFF_CHARS
|
||||
? ` (truncated ${diff.length}→${MAX_DIFF_CHARS} chars; matrix retrieval supplies cross-PR context)`
|
||||
: "";
|
||||
|
||||
// Build the reviewer prompt in the same shape as run_codereview's
|
||||
// review stage (llm_team_ui.py:10950), adapted for claim verification:
|
||||
@ -111,148 +43,118 @@ export async function runInferenceCheck(
|
||||
// "Code: ..."
|
||||
// "Review: bugs/security/perf/style/edge. Provide corrected code."
|
||||
// We add: claim list upfront + ask for structured JSON verdict.
|
||||
//
|
||||
// Curation flag is now just a truncation flag — when the diff was
|
||||
// cut, tell the reviewer it didn't see the full picture so it doesn't
|
||||
// confidently mark a claim NOT BACKED based on absence in the
|
||||
// (potentially incomplete) input.
|
||||
const isCurated = curationNote.length > 0;
|
||||
const prNumber = ctx?.pr_number ?? 0;
|
||||
const systemMsg = [
|
||||
"You review pull-request diffs against the author's own ship-claims.",
|
||||
"For each claim, decide: is it backed by actual code in the diff, or is",
|
||||
"it placeholder / aspirational / unwired?",
|
||||
"",
|
||||
"A claim is BACKED when the diff contains a real code path that delivers",
|
||||
"the claimed behavior. A claim is NOT BACKED when:",
|
||||
" - the claim asserts functionality but the diff only adds types/fields",
|
||||
" with no consumer",
|
||||
" - the claim mentions tests but no test function was added",
|
||||
" - the claim claims integration but the integration point is a stub",
|
||||
" - the diff contains unimplemented!() / todo!() / TODO comments",
|
||||
" - the claim says 'works end-to-end' but the diff has no end-to-end test",
|
||||
"",
|
||||
"Respond with strict JSON only. No prose before or after. Shape:",
|
||||
"{",
|
||||
' "claim_verdicts": [',
|
||||
' {"claim_idx": 0, "backed": false, "evidence": "short reason"}',
|
||||
" ],",
|
||||
' "unflagged_gaps": [',
|
||||
' {"location": "file:line", "summary": "short description"}',
|
||||
" ]",
|
||||
"}",
|
||||
].join("\n");
|
||||
|
||||
// N=3 consensus — fire the mode runner three times in parallel.
|
||||
// Each /v1/mode/execute call composes pathway memory + answers corpus
|
||||
// + JSON-shaped pr_audit framing internally, so the auditor's only
|
||||
// job here is to vote-aggregate. Wall-clock ~= single call.
|
||||
const primaryRuns = await Promise.all(
|
||||
Array.from({ length: N_CONSENSUS }, () =>
|
||||
runModeRunnerInference(truncated, verifiable, prNumber, isCurated, MODEL)),
|
||||
);
|
||||
const userMsg = [
|
||||
`Ship-claims the author made (numbered 0..N-1):`,
|
||||
claims.map((c, i) => ` ${i}. [${c.strength}] "${c.text}" at ${c.location}`).join("\n"),
|
||||
"",
|
||||
`Diff:`,
|
||||
"```",
|
||||
truncated,
|
||||
"```",
|
||||
"",
|
||||
`For each numbered claim above, emit a claim_verdicts entry. For gaps the`,
|
||||
`author DIDN'T claim but that look like placeholder code, emit unflagged_gaps.`,
|
||||
`Strict JSON only, matching the shape described. No prose outside JSON.`,
|
||||
].join("\n");
|
||||
|
||||
const parsedRuns = primaryRuns.filter(r => r.parsed !== null);
|
||||
if (parsedRuns.length === 0) {
|
||||
// All N calls failed. Surface the first-run diagnostic so the
|
||||
// operator sees *why* (unreachable / non-200 / unparseable).
|
||||
const first = primaryRuns[0];
|
||||
let resp: Response;
|
||||
try {
|
||||
resp = await fetch(`${GATEWAY}/v1/chat`, {
|
||||
method: "POST",
|
||||
headers: { "content-type": "application/json" },
|
||||
body: JSON.stringify({
|
||||
provider: "ollama_cloud",
|
||||
model: MODEL,
|
||||
messages: [
|
||||
{ role: "system", content: systemMsg },
|
||||
{ role: "user", content: userMsg },
|
||||
],
|
||||
max_tokens: 3000,
|
||||
temperature: 0.2,
|
||||
think: true, // T3 overseer should reason — JSON shape is still required
|
||||
}),
|
||||
signal: AbortSignal.timeout(CALL_TIMEOUT_MS),
|
||||
});
|
||||
} catch (e) {
|
||||
// Cloud unreachable → soft-fail. Don't block a PR because the
|
||||
// reviewer model is down. Static + dynamic + kb still run.
|
||||
return [{
|
||||
check: "inference",
|
||||
severity: "info",
|
||||
summary: `cloud inference all ${N_CONSENSUS} consensus runs failed — ${first.error ?? "unknown"}`,
|
||||
summary: "cloud inference unreachable — skipped",
|
||||
evidence: [`fetch failed: ${(e as Error).message.slice(0, 180)}`],
|
||||
}];
|
||||
}
|
||||
|
||||
if (!resp.ok) {
|
||||
return [{
|
||||
check: "inference",
|
||||
severity: "info",
|
||||
summary: `cloud inference returned ${resp.status} — skipped`,
|
||||
evidence: [`body: ${(await resp.text()).slice(0, 200)}`],
|
||||
}];
|
||||
}
|
||||
|
||||
const body: any = await resp.json();
|
||||
const content: string = body?.choices?.[0]?.message?.content ?? "";
|
||||
const usage = body?.usage ?? {};
|
||||
|
||||
const parsed = extractJson(content);
|
||||
if (!parsed) {
|
||||
return [{
|
||||
check: "inference",
|
||||
severity: "info",
|
||||
summary: "cloud returned unparseable output — skipped",
|
||||
evidence: [
|
||||
`first-run diagnostic: ${first.diagnostic ?? "(none)"}`,
|
||||
`successful runs: 0 / ${N_CONSENSUS}`,
|
||||
`head: ${content.slice(0, 200)}`,
|
||||
`tokens: ${usage.total_tokens ?? "?"}`,
|
||||
],
|
||||
}];
|
||||
}
|
||||
|
||||
// Aggregate votes per claim_idx.
|
||||
interface Votes { trues: number; falses: number; evidences: string[] }
|
||||
const votesByClaim = new Map<number, Votes>();
|
||||
const unflaggedByRun: any[][] = [];
|
||||
// The N=3 consensus calls run via Promise.all — wall-clock is
|
||||
// bounded by the SLOWEST call, not the sum. Pre-2026-04-27 we
|
||||
// summed and reported "Xms total" which double/triple-counted
|
||||
// (Opus self-audit caught it). Use max for accurate wall-clock.
|
||||
let maxLatencyMs = 0;
|
||||
let totalEnrichedChars = 0;
|
||||
let bugFingerprintsSeen = 0;
|
||||
let matrixKeptSeen = 0;
|
||||
for (const run of parsedRuns) {
|
||||
maxLatencyMs = Math.max(maxLatencyMs, run.latency_ms ?? 0);
|
||||
totalEnrichedChars += run.enriched_chars ?? 0;
|
||||
bugFingerprintsSeen = Math.max(bugFingerprintsSeen, run.bug_fingerprints ?? 0);
|
||||
matrixKeptSeen = Math.max(matrixKeptSeen, run.matrix_kept ?? 0);
|
||||
unflaggedByRun.push(Array.isArray(run.parsed?.unflagged_gaps) ? run.parsed.unflagged_gaps : []);
|
||||
for (const v of run.parsed?.claim_verdicts ?? []) {
|
||||
const idx = Number(v?.claim_idx);
|
||||
if (!Number.isFinite(idx)) continue;
|
||||
const rec = votesByClaim.get(idx) ?? { trues: 0, falses: 0, evidences: [] };
|
||||
if (v.backed === false) {
|
||||
rec.falses++;
|
||||
rec.evidences.push(String(v.evidence ?? ""));
|
||||
} else if (v.backed === true) {
|
||||
rec.trues++;
|
||||
}
|
||||
votesByClaim.set(idx, rec);
|
||||
}
|
||||
}
|
||||
|
||||
const findings: Finding[] = [];
|
||||
|
||||
// Summary finding so the verdict layer knows the check ran.
|
||||
// One summary info finding so the verdict layer knows the check ran.
|
||||
findings.push({
|
||||
check: "inference",
|
||||
severity: "info",
|
||||
summary: `pr_audit mode runner completed (model=${MODEL}, consensus=${parsedRuns.length}/${N_CONSENSUS}, ${maxLatencyMs}ms wall-clock)${curationNote}`,
|
||||
summary: `cloud review completed (model=${MODEL}, tokens=${usage.total_tokens ?? "?"})`,
|
||||
evidence: [
|
||||
`claims voted: ${votesByClaim.size}`,
|
||||
`parsed runs: ${parsedRuns.length} / ${N_CONSENSUS}`,
|
||||
`enrichment: ${bugFingerprintsSeen} bug fingerprints, ${matrixKeptSeen} answers-corpus chunks, prompt avg ${Math.round(totalEnrichedChars / Math.max(parsedRuns.length, 1))} chars`,
|
||||
`claim_verdicts: ${parsed.claim_verdicts?.length ?? 0}, unflagged_gaps: ${parsed.unflagged_gaps?.length ?? 0}`,
|
||||
],
|
||||
});
|
||||
|
||||
// Per-claim majority vote; tie-break if no majority.
|
||||
const discrepancies: Array<{
|
||||
claim_idx: number;
|
||||
claim_text: string;
|
||||
votes: { trues: number; falses: number };
|
||||
resolution: "majority_backed" | "majority_not_backed" | "tiebreaker_backed" | "tiebreaker_not_backed" | "unresolved";
|
||||
tiebreaker_model?: string;
|
||||
}> = [];
|
||||
|
||||
for (const [idx, votes] of votesByClaim) {
|
||||
const claim = verifiable[idx];
|
||||
if (!claim) continue;
|
||||
const totalVotes = votes.trues + votes.falses;
|
||||
let notBacked: boolean | null = null;
|
||||
let resolution: typeof discrepancies[number]["resolution"] = "majority_backed";
|
||||
let evidenceText = "";
|
||||
let tbModel: string | undefined;
|
||||
|
||||
if (votes.falses > votes.trues) {
|
||||
notBacked = true;
|
||||
resolution = "majority_not_backed";
|
||||
evidenceText = votes.evidences[0] ?? "(no reason given)";
|
||||
} else if (votes.trues > votes.falses) {
|
||||
notBacked = false;
|
||||
resolution = "majority_backed";
|
||||
} else {
|
||||
// Tie. Run tie-breaker with a different-architecture model
|
||||
// through the same mode runner so framing/enrichment match.
|
||||
const tb = await runModeRunnerInference(truncated, verifiable, prNumber, isCurated, TIEBREAKER_MODEL);
|
||||
if (tb.parsed) {
|
||||
const tv = (tb.parsed.claim_verdicts ?? []).find((v: any) => Number(v?.claim_idx) === idx);
|
||||
if (tv?.backed === false) {
|
||||
notBacked = true;
|
||||
resolution = "tiebreaker_not_backed";
|
||||
evidenceText = `(tie-breaker ${TIEBREAKER_MODEL}) ${String(tv.evidence ?? "")}`;
|
||||
tbModel = TIEBREAKER_MODEL;
|
||||
} else if (tv?.backed === true) {
|
||||
notBacked = false;
|
||||
resolution = "tiebreaker_backed";
|
||||
tbModel = TIEBREAKER_MODEL;
|
||||
} else {
|
||||
resolution = "unresolved";
|
||||
}
|
||||
} else {
|
||||
resolution = "unresolved";
|
||||
}
|
||||
}
|
||||
|
||||
// Log every case where the N runs disagreed — discrepancies are
|
||||
// signal, not noise. Separate from audit_lessons.jsonl because
|
||||
// they're about the *auditor's* quality, not the PR's quality.
|
||||
const disagreed = totalVotes >= 2 && votes.trues > 0 && votes.falses > 0;
|
||||
if (disagreed || resolution.startsWith("tiebreaker") || resolution === "unresolved") {
|
||||
discrepancies.push({
|
||||
claim_idx: idx,
|
||||
claim_text: claim.text,
|
||||
votes: { trues: votes.trues, falses: votes.falses },
|
||||
resolution,
|
||||
tiebreaker_model: tbModel,
|
||||
});
|
||||
}
|
||||
|
||||
if (notBacked === true) {
|
||||
for (const v of parsed.claim_verdicts ?? []) {
|
||||
if (v?.backed === false) {
|
||||
const idx = typeof v.claim_idx === "number" ? v.claim_idx : -1;
|
||||
const claim = claims[idx];
|
||||
if (!claim) continue;
|
||||
// Strong+unbacked = BLOCK. That's the whole point of the auditor.
|
||||
const sev: Finding["severity"] = claim.strength === "strong" ? "block"
|
||||
: claim.strength === "moderate" ? "warn"
|
||||
: "info";
|
||||
@ -263,300 +165,24 @@ export async function runInferenceCheck(
|
||||
summary: `cloud: claim not backed — "${claim.text.slice(0, 100)}"`,
|
||||
evidence: [
|
||||
`at ${claim.location}`,
|
||||
`consensus: ${votes.falses}/${totalVotes} not-backed (resolution: ${resolution})`,
|
||||
`cloud reason: ${evidenceText.slice(0, 200)}`,
|
||||
`cloud reason: ${String(v.evidence ?? "no reason given").slice(0, 200)}`,
|
||||
],
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Persist discrepancies so we can measure consensus drift over time.
|
||||
if (discrepancies.length > 0 && ctx) {
|
||||
persistDiscrepancies(ctx, discrepancies).catch(e =>
|
||||
console.error(`[inference] discrepancy log failed: ${(e as Error).message}`));
|
||||
}
|
||||
|
||||
// Use first run's parsed for downstream unflagged_gaps processing.
|
||||
const parsed = parsedRuns[0].parsed;
|
||||
|
||||
// Route the curated scratchpad through llm_team's extract-facts
|
||||
// pipeline when we have (a) a curated scratchpad (best signal about
|
||||
// what the PR actually changed) and (b) PR context to scope facts.
|
||||
// AWAITED (not fire-and-forget) so CLI callers like audit_one.ts
|
||||
// don't exit before extraction lands; the systemd poller has plenty
|
||||
// of headroom (90s cycle vs ~15s extraction). A failure inside
|
||||
// extractAndPersistFacts is caught + logged but never throws.
|
||||
// Post-2026-04-27: extraction now runs against the truncated diff
|
||||
// (no scratchpad to extract from since tree-split was retired).
|
||||
// Fact extraction is still useful for surfacing entities/symbols
|
||||
// into audit_facts.jsonl even from truncated input.
|
||||
if (isCurated && ctx && process.env.LH_AUDITOR_SKIP_EXTRACT !== "1") {
|
||||
try {
|
||||
await extractAndPersistFacts(truncated, ctx);
|
||||
} catch (e) {
|
||||
console.error(`[inference] fact extraction failed: ${(e as Error).message}`);
|
||||
}
|
||||
}
|
||||
|
||||
// Belt-and-suspenders: when operating on a curated scratchpad, drop
|
||||
// the unflagged_gaps section entirely. The distillation can't
|
||||
// reliably ground gap-detection, and false positives are worse than
|
||||
// misses for this signal class. The systemMsg already asks the
|
||||
// cloud to skip this section when curated — but the model may still
|
||||
// emit it, so we filter here too.
|
||||
const gapsToEmit = isCurated ? [] : (parsed.unflagged_gaps ?? []);
|
||||
for (const g of gapsToEmit) {
|
||||
const summary = String(g?.summary ?? "?");
|
||||
const location = String(g?.location ?? "?");
|
||||
// False-positive guard — when the cloud says "X not defined in this
|
||||
// diff" or "missing implementation of X", the cloud may just mean
|
||||
// "X is not in the added lines," not "X doesn't exist in the repo."
|
||||
// Extract candidate symbol names and grep the repo. If any symbol
|
||||
// is defined elsewhere, drop the finding — it's a known-symbol
|
||||
// reference, not a placeholder.
|
||||
if (/not\s+defined|missing\s+implementation|never\s+referenced\s+or\s+integrated/i.test(summary)) {
|
||||
const symbols = extractSymbols(summary);
|
||||
if (symbols.length > 0) {
|
||||
const resolved = await symbolsExistInRepo(symbols);
|
||||
if (resolved.length === symbols.length) {
|
||||
// Every named symbol exists somewhere in the repo — silent drop.
|
||||
continue;
|
||||
}
|
||||
if (resolved.length > 0) {
|
||||
// Partially resolved — demote to info with a note.
|
||||
findings.push({
|
||||
check: "inference",
|
||||
severity: "info",
|
||||
summary: `cloud gap partially resolved by repo grep: ${summary.slice(0, 120)}`,
|
||||
evidence: [
|
||||
`location: ${location.slice(0, 140)}`,
|
||||
`resolved via grep: ${resolved.join(",")}`,
|
||||
`unresolved: ${symbols.filter(s => !resolved.includes(s)).join(",")}`,
|
||||
],
|
||||
});
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
for (const g of parsed.unflagged_gaps ?? []) {
|
||||
findings.push({
|
||||
check: "inference",
|
||||
severity: "warn",
|
||||
summary: `cloud-flagged gap not in any claim: ${summary.slice(0, 120)}`,
|
||||
evidence: [`location: ${location.slice(0, 140)}`],
|
||||
summary: `cloud-flagged gap not in any claim: ${String(g?.summary ?? "?").slice(0, 120)}`,
|
||||
evidence: [`location: ${String(g?.location ?? "?").slice(0, 140)}`],
|
||||
});
|
||||
}
|
||||
|
||||
return findings;
|
||||
}
|
||||
|
||||
// Single mode-runner call — consensus + tie-breaker dispatch through
|
||||
// here. Returns parsed JSON shape + telemetry from /v1/mode/execute
|
||||
// (latency, enrichment metrics) + any error diagnostic. NEVER throws.
|
||||
// The consensus aggregator handles partial failures by dropping
|
||||
// non-parsed runs from the vote.
|
||||
interface CloudRunResult {
|
||||
parsed: any | null;
|
||||
latency_ms: number;
|
||||
enriched_chars: number;
|
||||
bug_fingerprints: number;
|
||||
matrix_kept: number;
|
||||
error?: string; // "unreachable" | "non_200" | "unparseable"
|
||||
diagnostic?: string; // first 200 chars for debugging
|
||||
model: string;
|
||||
}
|
||||
|
||||
async function runModeRunnerInference(
|
||||
diffOrScratchpad: string,
|
||||
claims: Claim[],
|
||||
prNumber: number,
|
||||
isCurated: boolean,
|
||||
model: string,
|
||||
): Promise<CloudRunResult> {
|
||||
// user_question carries the claim list + the curation note (if any).
|
||||
// pr_audit's framing (mode.rs FRAMING_PR_AUDIT) holds the JSON shape +
|
||||
// strict-output rules so we don't repeat them here.
|
||||
const claimDigest = claims
|
||||
.map((c, i) => ` ${i}. [${c.strength}] "${c.text}" at ${c.location}`)
|
||||
.join("\n");
|
||||
const curationNote = isCurated
|
||||
? "\n\nNOTE: the FILE below is a curated multi-shard scratchpad of the diff, not the raw diff itself. Absence in the scratchpad is NOT evidence of absence in the actual diff. Only mark backed=false on direct contradiction (e.g. scratchpad shows the function is empty / a stub). Skip unflagged_gaps entirely when scratchpad is curated."
|
||||
: "";
|
||||
const userQuestion = [
|
||||
"Verify each ship-claim against the diff (or scratchpad).",
|
||||
"",
|
||||
"Ship-claims (numbered 0..N-1):",
|
||||
claimDigest,
|
||||
curationNote,
|
||||
"",
|
||||
"Every claim above must produce exactly one claim_verdicts entry. Output strict JSON only — no prose outside the JSON object.",
|
||||
].join("\n");
|
||||
|
||||
let resp: Response;
|
||||
try {
|
||||
resp = await fetch(`${GATEWAY}/v1/mode/execute`, {
|
||||
method: "POST",
|
||||
headers: { "content-type": "application/json" },
|
||||
body: JSON.stringify({
|
||||
task_class: "pr_audit",
|
||||
file_path: `pr-${prNumber}.diff`,
|
||||
file_content: diffOrScratchpad,
|
||||
user_question: userQuestion,
|
||||
force_model: model,
|
||||
force_temperature: 0,
|
||||
}),
|
||||
signal: AbortSignal.timeout(MODE_RUNNER_TIMEOUT_MS),
|
||||
});
|
||||
} catch (e) {
|
||||
return {
|
||||
parsed: null, latency_ms: 0, enriched_chars: 0, bug_fingerprints: 0, matrix_kept: 0,
|
||||
error: "unreachable", diagnostic: (e as Error).message.slice(0, 200), model,
|
||||
};
|
||||
}
|
||||
if (!resp.ok) {
|
||||
return {
|
||||
parsed: null, latency_ms: 0, enriched_chars: 0, bug_fingerprints: 0, matrix_kept: 0,
|
||||
error: "non_200", diagnostic: `${resp.status}: ${(await resp.text()).slice(0, 160)}`, model,
|
||||
};
|
||||
}
|
||||
let body: any;
|
||||
try { body = await resp.json(); }
|
||||
catch (e) {
|
||||
return {
|
||||
parsed: null, latency_ms: 0, enriched_chars: 0, bug_fingerprints: 0, matrix_kept: 0,
|
||||
error: "unparseable", diagnostic: (e as Error).message, model,
|
||||
};
|
||||
}
|
||||
const content: string = typeof body?.response === "string" ? body.response : "";
|
||||
const parsed = extractJson(content);
|
||||
// Number-coerced extractors so a non-numeric upstream value (string,
|
||||
// null, NaN) collapses to 0 instead of poisoning downstream
|
||||
// arithmetic. Caught 2026-04-27 by kimi_architect self-audit —
|
||||
// optional-chaining + ?? only catches null/undefined, not type drift.
|
||||
const num = (v: unknown): number => {
|
||||
const n = typeof v === "number" ? v : Number(v);
|
||||
return Number.isFinite(n) ? n : 0;
|
||||
};
|
||||
return {
|
||||
parsed,
|
||||
latency_ms: num(body?.latency_ms),
|
||||
enriched_chars: num(body?.enriched_prompt_chars),
|
||||
bug_fingerprints: num(body?.sources?.bug_fingerprints_count),
|
||||
matrix_kept: num(body?.sources?.matrix_chunks_kept),
|
||||
error: parsed ? undefined : "unparseable",
|
||||
diagnostic: parsed ? undefined : content.slice(0, 200),
|
||||
model,
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
async function persistDiscrepancies(ctx: InferenceContext, discrepancies: any[]): Promise<void> {
|
||||
await mkdir("/home/profit/lakehouse/data/_kb", { recursive: true });
|
||||
const rows = discrepancies.map(d => JSON.stringify({
|
||||
pr_number: ctx.pr_number,
|
||||
head_sha: ctx.head_sha,
|
||||
logged_at: new Date().toISOString(),
|
||||
...d,
|
||||
}));
|
||||
await appendFile(AUDIT_DISCREPANCIES_JSONL, rows.join("\n") + "\n");
|
||||
}
|
||||
|
||||
// Extract structured knowledge from the curated scratchpad and append
|
||||
// to data/_kb/audit_facts.jsonl — one row per extract run, keyed by
|
||||
// PR number + head SHA for scope tracking. kb_query tails this next
|
||||
// audit to surface recurring entities/relationships across PRs.
|
||||
async function extractAndPersistFacts(scratchpad: string, ctx: InferenceContext): Promise<void> {
|
||||
const ex = await extractFacts(scratchpad);
|
||||
if (ex.error && ex.entities.length === 0 && ex.facts.length === 0) {
|
||||
// Full failure — log but don't write an empty row.
|
||||
console.error(`[inference] extractFacts skipped row: ${ex.error}`);
|
||||
return;
|
||||
}
|
||||
const row = {
|
||||
pr_number: ctx.pr_number,
|
||||
head_sha: ctx.head_sha,
|
||||
extracted_at: ex.extracted_at,
|
||||
extractor: ex.extractor_model,
|
||||
verifier: ex.verifier_model,
|
||||
llm_team_run_id: ex.llm_team_run_id ?? null,
|
||||
facts: ex.facts,
|
||||
entities: ex.entities,
|
||||
relationships: ex.relationships,
|
||||
verification_preview: ex.verification.slice(0, 400),
|
||||
verifier_verdicts: ex.verifier_verdicts,
|
||||
facts_dropped_by_verifier: ex.facts_dropped_by_verifier ?? 0,
|
||||
schema_version: 2,
|
||||
source: "audit_inference",
|
||||
};
|
||||
await mkdir("/home/profit/lakehouse/data/_kb", { recursive: true });
|
||||
await appendFile(AUDIT_FACTS_JSONL, JSON.stringify(row) + "\n");
|
||||
}
|
||||
|
||||
|
||||
|
||||
// Pull out plausible code-symbol names from a summary string.
// Matches:
//   - identifier with backticks: `foo_bar`
//   - identifier followed by parens: foo_bar()
//   - CamelCase types
//   - snake_case_functions
// Filters out common English words that could be matched accidentally.
// Comparison is case-insensitive (extractSymbols lowercases before the
// lookup), so only lowercase forms need to be listed here.
const STOPWORDS = new Set([
  "not","the","and","for","this","that","with","but","are","was","has",
  "have","been","any","missing","implementation","diff","defined","never",
  "referenced","integrated","flow","code","file","some","only","when",
]);
|
||||
function extractSymbols(text: string): string[] {
|
||||
const out = new Set<string>();
|
||||
// `backticked` symbols
|
||||
for (const m of text.matchAll(/`([A-Za-z_][A-Za-z0-9_]{2,})`/g)) out.add(m[1]);
|
||||
// foo() or foo_bar() calls
|
||||
for (const m of text.matchAll(/\b([A-Za-z_][A-Za-z0-9_]{2,})\s*\(/g)) out.add(m[1]);
|
||||
// CamelCase types (3+ chars, must start with uppercase)
|
||||
for (const m of text.matchAll(/\b([A-Z][A-Za-z0-9]{2,})\b/g)) out.add(m[1]);
|
||||
return Array.from(out).filter(s => !STOPWORDS.has(s.toLowerCase()));
|
||||
}
|
||||
|
||||
// Scan the repo for at least one definition of each symbol. Uses Bun's
|
||||
// Glob to walk TS/Rust/Python/JS sources; ignores node_modules, data/,
|
||||
// and target/. Skips files > 500KB — those are fixtures/snapshots that
|
||||
// won't contain a definition line and slurping them slows the audit.
|
||||
async function symbolsExistInRepo(symbols: string[]): Promise<string[]> {
|
||||
const patterns = ["**/*.ts", "**/*.tsx", "**/*.rs", "**/*.py", "**/*.js"];
|
||||
const skip = (p: string) => p.includes("/node_modules/") || p.startsWith("data/") || p.includes("/target/") || p.startsWith("dist/");
|
||||
const MAX_FILE_BYTES = 500_000;
|
||||
const { stat } = await import("node:fs/promises");
|
||||
const resolved = new Set<string>();
|
||||
const toFind = new Set(symbols);
|
||||
for (const pat of patterns) {
|
||||
if (toFind.size === 0) break;
|
||||
const glob = new Glob(pat);
|
||||
for await (const f of glob.scan({ cwd: REPO_ROOT, onlyFiles: true })) {
|
||||
if (skip(f)) continue;
|
||||
try { const s = await stat(`${REPO_ROOT}/${f}`); if (s.size > MAX_FILE_BYTES) continue; } catch { continue; }
|
||||
let content: string;
|
||||
try { content = await readFile(`${REPO_ROOT}/${f}`, "utf8"); } catch { continue; }
|
||||
for (const sym of Array.from(toFind)) {
|
||||
// Definition heuristics: `function sym`, `fn sym`, `const sym`,
|
||||
// `let sym`, `def sym`, `class sym`, `struct sym`, `enum sym`,
|
||||
// `trait sym`, `async function sym`, `pub (async )?fn sym`.
|
||||
const re = new RegExp(
|
||||
`\\b(function|async\\s+function|const|let|var|def|class|struct|enum|trait|impl|type|interface|fn|pub\\s+(async\\s+)?fn)\\s+${escapeRe(sym)}\\b`
|
||||
);
|
||||
if (re.test(content)) {
|
||||
resolved.add(sym);
|
||||
toFind.delete(sym);
|
||||
if (toFind.size === 0) break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return Array.from(resolved);
|
||||
}
|
||||
|
||||
function escapeRe(s: string): string {
|
||||
return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
|
||||
}
|
||||
|
||||
// Lift the first balanced JSON object out of the response. Tolerates
|
||||
// leading prose, code fences, and model reasoning preamble when the
|
||||
// cloud model ignored "strict JSON only."
|
||||
|
||||
@ -8,7 +8,6 @@
|
||||
// What this check reads (all file-backed, append-only or periodic):
|
||||
// data/_kb/outcomes.jsonl — per-scenario outcomes (kb.ts)
|
||||
// data/_kb/error_corrections.jsonl — fail→succeed deltas on same sig
|
||||
// data/_kb/scrum_reviews.jsonl — scrum-master accepted reviews
|
||||
// data/_observer/ops.jsonl — observer ring → disk stream
|
||||
// data/_bot/cycles/*.json — bot cycle results
|
||||
//
|
||||
@ -18,18 +17,14 @@
|
||||
import { readFile, readdir, stat } from "node:fs/promises";
|
||||
import { join } from "node:path";
|
||||
import type { Claim, Finding } from "../types.ts";
|
||||
import { aggregate, ratingSeverity, formatAgg } from "../kb_index.ts";
|
||||
|
||||
const KB_DIR = "/home/profit/lakehouse/data/_kb";
|
||||
const OBSERVER_OPS = "/home/profit/lakehouse/data/_observer/ops.jsonl";
|
||||
const BOT_CYCLES_DIR = "/home/profit/lakehouse/data/_bot/cycles";
|
||||
const SCRUM_REVIEWS_JSONL = "/home/profit/lakehouse/data/_kb/scrum_reviews.jsonl";
|
||||
const AUDIT_LESSONS_JSONL = "/home/profit/lakehouse/data/_kb/audit_lessons.jsonl";
|
||||
const AUDIT_FACTS_JSONL = "/home/profit/lakehouse/data/_kb/audit_facts.jsonl";
|
||||
const TAIL_LINES = 500;
|
||||
const MAX_BOT_CYCLE_FILES = 30;
|
||||
|
||||
export async function runKbCheck(claims: Claim[], prFiles: string[] = []): Promise<Finding[]> {
|
||||
export async function runKbCheck(claims: Claim[]): Promise<Finding[]> {
|
||||
const findings: Finding[] = [];
|
||||
|
||||
// 1. Recent scenario outcomes: are strong-claim-style phrases showing
|
||||
@ -53,35 +48,6 @@ export async function runKbCheck(claims: Claim[], prFiles: string[] = []): Promi
|
||||
const obsFindings = await checkObserverStream();
|
||||
findings.push(...obsFindings);
|
||||
|
||||
// 5. Scrum-master reviews — surface prior accepted reviews for any
|
||||
// file in this PR's diff. Cohesion plan Phase C wire: the
|
||||
// auditor gets to "borrow" the scrum-master's deeper per-file
|
||||
// analysis instead of re-doing that work.
|
||||
if (prFiles.length > 0) {
|
||||
const scrumFindings = await checkScrumReviews(prFiles);
|
||||
findings.push(...scrumFindings);
|
||||
}
|
||||
|
||||
// 6b. Audit-facts (llm_team extract pipeline output) — surface
|
||||
// entities that recur across multiple PRs. These are the
|
||||
// "core system entities" accumulating in the knowledge base;
|
||||
// showing them as info on future audits gives reviewers
|
||||
// architectural context the raw diff doesn't convey.
|
||||
const factFindings = await checkAuditFacts();
|
||||
findings.push(...factFindings);
|
||||
|
||||
// 6. Audit-lessons feedback loop — summarize the top recurring
|
||||
// patterns from prior audits' block/warn findings. If the same
|
||||
// pattern signature has fired 3+ times across prior audits,
|
||||
// emit it as a block-severity finding so reviewers know this
|
||||
// is a known-recurring class, not a one-off. Does NOT couple
|
||||
// to the current audit's static/inference findings (those run
|
||||
// in parallel and we can't see them here) — the amplification
|
||||
// is emergent: if the current audit's finding-summary matches
|
||||
// a top recurrence, the reviewer sees both.
|
||||
const auditLessonFindings = await checkAuditLessons();
|
||||
findings.push(...auditLessonFindings);
|
||||
|
||||
return findings;
|
||||
}
|
||||
|
||||
@ -215,183 +181,3 @@ function observerBySource(ops: any[]): string {
|
||||
}
|
||||
return Object.entries(c).sort((a, b) => b[1] - a[1]).map(([k, v]) => `${k}=${v}`).join(", ") || "empty";
|
||||
}
|
||||
|
||||
// Audit-facts — reads data/_kb/audit_facts.jsonl (populated by every
// curated inference run via llm_team's extract pipeline). Each row
// has arrays: facts, entities, relationships. We explode entities and
// aggregate them across PRs using kb_index. An entity seen in 3+ PRs
// is a "core system entity" — we surface the top N as info context.
//
// Filters out short names (<3 chars, likely qwen2.5 truncation
// artifacts) and generic types ("string", "number") that would
// otherwise dominate the ranking.
const ENTITY_NAME_MIN_LEN = 3;
// Lowercase forms only — checkAuditFacts lowercases the entity name
// before the membership test.
const GENERIC_ENTITY_NAMES = new Set([
  "string", "number", "boolean", "any", "void", "unknown", "never",
  "object", "array", "function", "const", "let", "var", "true", "false",
  "null", "undefined", "promise", "map", "set", "record",
]);
|
||||
|
||||
async function checkAuditFacts(): Promise<Finding[]> {
  // Read raw rows — each row has multiple entities, so we can't just
  // use aggregate() directly (it's one-signature-per-row). Explode
  // entities into (row, entity) pairs, then aggregate by entity name.
  let raw: string;
  try { raw = await (await import("node:fs/promises")).readFile(AUDIT_FACTS_JSONL, "utf8"); }
  catch { return []; } // file absent / unreadable → KB not populated yet, nothing to report
  const lines = raw.split("\n").filter(l => l.length > 0);
  if (lines.length === 0) return [];

  interface EntityRow { entity_key: string; pr_number: number; type: string; name: string; description: string }
  const entityRows: EntityRow[] = [];
  // Tail a bounded window (2× the usual tail) so the scan stays cheap
  // as the JSONL grows; malformed rows and rows without a numeric
  // pr_number are silently skipped.
  for (const line of lines.slice(-TAIL_LINES * 2)) {
    let row: any;
    try { row = JSON.parse(line); } catch { continue; }
    const prNum = Number(row?.pr_number);
    if (!Number.isFinite(prNum)) continue;
    for (const e of Array.isArray(row?.entities) ? row.entities : []) {
      const name = String(e?.name ?? "").trim();
      if (name.length < ENTITY_NAME_MIN_LEN) continue;
      if (GENERIC_ENTITY_NAMES.has(name.toLowerCase())) continue;
      entityRows.push({
        entity_key: name.toLowerCase(), // case-insensitive grouping key
        pr_number: prNum,
        type: String(e?.type ?? "?"),
        name,
        description: String(e?.description ?? "").slice(0, 160),
      });
    }
  }
  if (entityRows.length === 0) return [];

  // Aggregate manually — one key per entity name, distinct_scopes by PR.
  // last_name/last_desc are deliberately last-wins: the most recent
  // observation's casing and description are shown to the reviewer.
  type Agg = { count: number; scopes: Set<number>; types: Set<string>; last_name: string; last_desc: string };
  const byEntity = new Map<string, Agg>();
  for (const r of entityRows) {
    const a = byEntity.get(r.entity_key) ?? {
      count: 0, scopes: new Set<number>(), types: new Set<string>(), last_name: "", last_desc: "",
    };
    a.count += 1;
    a.scopes.add(r.pr_number);
    a.types.add(r.type);
    a.last_name = r.name;
    a.last_desc = r.description;
    byEntity.set(r.entity_key, a);
  }

  // Rank: require 2+ distinct PRs (same-PR entity-repeats don't count
  // as "cross-cutting"). Take the top 5 to avoid flooding the verdict.
  // Primary sort: distinct-PR breadth; tie-break: raw observation count.
  const ranked = Array.from(byEntity.entries())
    .filter(([_, a]) => a.scopes.size >= 2)
    .sort((a, b) => b[1].scopes.size - a[1].scopes.size || b[1].count - a[1].count)
    .slice(0, 5);

  if (ranked.length === 0) {
    // Useful to know the KB is being populated — emit a single
    // summary so operators see fact extraction is alive.
    return [{
      check: "kb_query",
      severity: "info",
      summary: `audit_facts KB has ${entityRows.length} entity-observations across ${new Set(entityRows.map(r => r.pr_number)).size} PRs (no cross-PR recurrences yet)`,
      evidence: [`source: ${AUDIT_FACTS_JSONL}`],
    }];
  }

  // One info finding per core entity, with the PR list as evidence.
  return ranked.map(([_, a]) => ({
    check: "kb_query" as const,
    severity: "info" as const,
    summary: `core entity \`${a.last_name}\` recurs in ${a.scopes.size} PRs (types: ${Array.from(a.types).join(",")})`,
    evidence: [
      `count=${a.count} distinct_PRs=${a.scopes.size}`,
      `description: ${a.last_desc.slice(0, 200)}`,
      `PRs: ${Array.from(a.scopes).sort((x, y) => x - y).join(",")}`,
    ],
  }));
}
|
||||
|
||||
// Audit-lessons — reads data/_kb/audit_lessons.jsonl (populated by
|
||||
// every audit's appendAuditLessons). Uses the shared kb_index
|
||||
// aggregator: groups by `signature`, distinct-scopes keyed by PR
|
||||
// number, severity from ratingSeverity(agg) which applies the
|
||||
// confidence × count rating (see kb_index.ts). This is the same
|
||||
// aggregation any other KB reader uses — shared discipline, not
|
||||
// per-check custom logic.
|
||||
async function checkAuditLessons(): Promise<Finding[]> {
|
||||
const bySig = await aggregate<any>(AUDIT_LESSONS_JSONL, {
|
||||
keyFn: (r) => r?.signature,
|
||||
scopeFn: (r) => (r?.pr_number !== undefined ? `pr-${r.pr_number}` : undefined),
|
||||
checkFn: (r) => r?.check,
|
||||
tailLimit: TAIL_LINES * 4,
|
||||
});
|
||||
if (bySig.size === 0) return [];
|
||||
|
||||
const findings: Finding[] = [];
|
||||
for (const [sig, agg] of bySig) {
|
||||
// Silent on first-ever occurrence — not yet signal.
|
||||
if (agg.count < 2) continue;
|
||||
const sev = ratingSeverity(agg);
|
||||
findings.push({
|
||||
check: "kb_query",
|
||||
severity: sev,
|
||||
summary: `recurring audit pattern (${agg.distinct_scopes} distinct PRs, ${agg.count} flaggings, conf=${agg.confidence.toFixed(2)}): ${agg.representative_summary.slice(0, 160)}`,
|
||||
evidence: [
|
||||
`signature=${sig}`,
|
||||
`checks: ${agg.checks.join(",")}`,
|
||||
`scopes: ${agg.scopes.slice(-6).join(",")}`,
|
||||
formatAgg(agg),
|
||||
],
|
||||
});
|
||||
}
|
||||
return findings;
|
||||
}
|
||||
|
||||
// Scrum-master reviews — the scrum pipeline writes one row per
|
||||
// accepted per-file review. We match reviews whose `file` matches
|
||||
// any path in the PR's diff, then surface the *preview* + which
|
||||
// model the escalation ladder had to reach. If the scrum-master
|
||||
// needed the 123B specialist or larger to resolve a file, that's
|
||||
// a meaningful signal about the code's complexity — and it's
|
||||
// surfaced to the PR without the auditor having to re-run the
|
||||
// escalation ladder itself.
|
||||
async function checkScrumReviews(prFiles: string[]): Promise<Finding[]> {
|
||||
const rows = await tailJsonl<any>(SCRUM_REVIEWS_JSONL, TAIL_LINES);
|
||||
if (rows.length === 0) return [];
|
||||
|
||||
// Match by exact file OR filename suffix — PR files arrive as
|
||||
// `auditor/audit.ts`-style relative paths; scrum stores the same.
|
||||
const norm = (p: string) => p.replace(/^\/+/, "").replace(/^home\/profit\/lakehouse\//, "");
|
||||
const prSet = new Set(prFiles.map(norm));
|
||||
|
||||
// Keep only the most recent review per file (last-wins).
|
||||
const latestByFile = new Map<string, any>();
|
||||
for (const r of rows) {
|
||||
const f = norm(String(r.file ?? ""));
|
||||
if (!f) continue;
|
||||
if (!prSet.has(f)) continue;
|
||||
latestByFile.set(f, r);
|
||||
}
|
||||
if (latestByFile.size === 0) return [];
|
||||
|
||||
const findings: Finding[] = [];
|
||||
for (const [file, r] of latestByFile) {
|
||||
const model = String(r.accepted_model ?? "?");
|
||||
const attempt = r.accepted_on_attempt ?? "?";
|
||||
const treeSplit = !!r.tree_split_fired;
|
||||
// Heuristic: if the scrum-master had to escalate past attempt 3,
|
||||
// or had to tree-split, that's context the PR reviewer should see.
|
||||
// Severity: info for low-escalation, warn if escalated far up
|
||||
// the ladder (cloud specialist required).
|
||||
const heavyEscalation = Number(attempt) >= 4;
|
||||
const sev: "warn" | "info" = heavyEscalation ? "warn" : "info";
|
||||
findings.push({
|
||||
check: "kb_query",
|
||||
severity: sev,
|
||||
summary: `scrum-master review for \`${file}\` — accepted on attempt ${attempt} by \`${model}\`${treeSplit ? " (tree-split)" : ""}`,
|
||||
evidence: [
|
||||
`reviewed_at: ${r.reviewed_at ?? "?"}`,
|
||||
`preview: ${String(r.suggestions_preview ?? "").slice(0, 300).replace(/\n/g, " ")}`,
|
||||
],
|
||||
});
|
||||
}
|
||||
return findings;
|
||||
}
|
||||
|
||||
@ -1,461 +0,0 @@
|
||||
// Kimi-architect check — second-pass senior architectural review using
|
||||
// kimi-for-coding (Kimi K2.6) via /v1/chat provider=kimi.
|
||||
//
|
||||
// Runs AFTER the deepseek inference check (N=3 consensus) and the
|
||||
// static/kb_query checks. Reads their findings as context and asks Kimi
|
||||
// "what did everyone else miss?" — complementing the cheap-consensus
|
||||
// voting with a sparse senior pass that catches load-bearing issues
|
||||
// (compile errors, false telemetry, schema bypasses, etc.) which the
|
||||
// voting structure can't see.
|
||||
//
|
||||
// Why Kimi here and not in the inner inference loop:
|
||||
// - Cost: ~3min wall-clock per call vs ~30s for deepseek consensus.
|
||||
// - TOS: api.kimi.com is User-Agent-gated (see crates/gateway/src/v1/
|
||||
// kimi.rs); cost-bounded calls only.
|
||||
// - Value: experiment 2026-04-27 showed 7/7 grounding rate with full
|
||||
// files vs ~50% on truncated input. Best as a sparse complement, not
|
||||
// a replacement.
|
||||
//
|
||||
// Failure-isolated: any Kimi error returns a single info-level Finding
|
||||
// "kimi_architect skipped — <reason>" so the existing audit pipeline
|
||||
// is never blocked by a Kimi outage / TOS revocation / 429.
|
||||
//
|
||||
// Cost cap: if a kimi_verdicts/<pr>-<sha>.json file exists less than 24h
|
||||
// old, return cached findings without calling upstream. New commits
|
||||
// produce new SHAs so this is per-head, not per-day.
|
||||
//
|
||||
// Off by default: caller checks LH_AUDITOR_KIMI=1 before invoking.
|
||||
|
||||
import { readFile, writeFile, mkdir, appendFile, stat, realpath } from "node:fs/promises";
|
||||
import { existsSync, realpathSync } from "node:fs";
|
||||
import { dirname, join, resolve } from "node:path";
|
||||
import type { Finding, CheckKind } from "../types.ts";
|
||||
|
||||
const GATEWAY = process.env.LH_GATEWAY_URL ?? "http://localhost:3100";
|
||||
const KIMI_VERDICTS_DIR = "/home/profit/lakehouse/data/_auditor/kimi_verdicts";
|
||||
const KIMI_AUDITS_JSONL = "/home/profit/lakehouse/data/_kb/kimi_audits.jsonl";
|
||||
const REPO_ROOT = "/home/profit/lakehouse";
|
||||
// Canonicalize at module load — REPO_ROOT itself may be a symlink in
// some environments (e.g. /home/profit is a bind-mount). Computing
// once at startup means the per-finding grounding loop can compare
// realpath(target) against this stable anchor.
const REPO_ROOT_REAL = (() => {
  try { return realpathSync(REPO_ROOT); }
  // Path missing or unreadable at import time — fall back to the
  // un-canonicalized constant rather than crashing module load.
  catch { return REPO_ROOT; }
})();
|
||||
// 15 min budget. Bun's fetch has an intrinsic ~300s limit that our
|
||||
// AbortController + setTimeout combo could not override; we use curl
|
||||
// via Bun.spawn instead (callKimi below). Curl honors -m for max
|
||||
// transfer time without a hard intrinsic ceiling.
|
||||
const CALL_TIMEOUT_MS = 900_000;
|
||||
const CACHE_TTL_MS = 24 * 60 * 60 * 1000;
|
||||
const MAX_DIFF_CHARS = 180_000;
|
||||
const MAX_PRIOR_FINDINGS = 50;
|
||||
// Default provider/model = ollama_cloud/kimi-k2.6. Pre-2026-04-27 we
|
||||
// went direct to api.kimi.com, but Ollama Cloud Pro now exposes the
|
||||
// same model legitimately, so we route there to avoid User-Agent
|
||||
// gating. The api.kimi.com path (provider=kimi) remains wired in the
|
||||
// gateway as a fallback for when Ollama Cloud is upstream-broken.
|
||||
const KIMI_PROVIDER = process.env.LH_AUDITOR_KIMI_PROVIDER ?? "ollama_cloud";
|
||||
const KIMI_MODEL = process.env.LH_AUDITOR_KIMI_MODEL ?? "kimi-k2.6";
|
||||
// Cross-lineage alternation. 2026-04-27 J's call: Opus is too
|
||||
// expensive to auto-fire (~$0.30/audit). Kimi K2.6 via Go-sub is
|
||||
// effectively free; Haiku 4.5 via Zen is ~$0.04. Alternate between
|
||||
// them so we get cross-lineage signal (Moonshot vs Anthropic) on
|
||||
// every PR's audit history without burning the budget.
|
||||
//
|
||||
// Default: Kimi K2.6 on even audits, Haiku 4.5 on odd. Each PR's
|
||||
// audits flip between vendors as new SHAs come in.
|
||||
//
|
||||
// Frontier models (Opus 4.7, GPT-5.5, Gemini 3.1) are NOT in the
|
||||
// auto path. Operator hands distilled findings to a frontier model
|
||||
// manually when high-leverage decisions need it. Removing Opus from
|
||||
// auto-promotion saves ~$1-3/day on the daemon at our cadence.
|
||||
//
|
||||
// Override the alternation entirely with LH_AUDITOR_KIMI_MODEL
|
||||
// (forces one model regardless of audit count); set
|
||||
// LH_AUDITOR_KIMI_ALT_MODEL to the alternate.
|
||||
const ALT_MODEL = process.env.LH_AUDITOR_KIMI_ALT_MODEL ?? "claude-haiku-4-5";
|
||||
const ALT_PROVIDER = process.env.LH_AUDITOR_KIMI_ALT_PROVIDER ?? "opencode";
|
||||
const FORCE_DEFAULT = process.env.LH_AUDITOR_KIMI_MODEL !== undefined && process.env.LH_AUDITOR_KIMI_MODEL !== "";
|
||||
|
||||
function selectModel(diffLen: number, auditIndex: number = 0): { provider: string; model: string; promoted: boolean } {
|
||||
// Operator override — env-pinned model wins.
|
||||
if (FORCE_DEFAULT) {
|
||||
return { provider: KIMI_PROVIDER, model: KIMI_MODEL, promoted: false };
|
||||
}
|
||||
// Alternate Kimi (default, even index) ↔ Haiku (alt, odd index).
|
||||
// diffLen kept in the signature for future "big diff → Haiku
|
||||
// anyway" logic; not used yet so we don't auto-burn on big PRs.
|
||||
void diffLen;
|
||||
if (auditIndex % 2 === 1) {
|
||||
return { provider: ALT_PROVIDER, model: ALT_MODEL, promoted: true };
|
||||
}
|
||||
return { provider: KIMI_PROVIDER, model: KIMI_MODEL, promoted: false };
|
||||
}
|
||||
// Model-aware max_tokens. Different upstream APIs cap at different
|
||||
// limits and reject requests that exceed them:
|
||||
// - Anthropic Opus 4.x: 32K output (with extended-output header)
|
||||
// - Anthropic Haiku 4.5: 8K output
|
||||
// - Kimi K2.6 (reasoning): 128K — needs headroom because
|
||||
// reasoning_content counts against the budget
|
||||
// - Default: 16K, conservative middle ground
|
||||
//
|
||||
// 2026-04-27 BLOCK from Opus self-audit: the prior single-default of
|
||||
// 128K worked silently (Anthropic clamps server-side) but was
|
||||
// technically invalid. Per-model caps make it explicit. Override via
|
||||
// LH_AUDITOR_KIMI_MAX_TOKENS to force a value (also fixes the empty-
|
||||
// env Number("") -> 0 trap by using `||` not `??`).
|
||||
const MAX_TOKENS_OVERRIDE = Number(process.env.LH_AUDITOR_KIMI_MAX_TOKENS) || 0;
|
||||
function maxTokensFor(model: string): number {
|
||||
if (MAX_TOKENS_OVERRIDE > 0) return MAX_TOKENS_OVERRIDE;
|
||||
if (model.startsWith("claude-opus")) return 32_000;
|
||||
if (model.startsWith("claude-haiku") || model.startsWith("claude-sonnet")) return 8_192;
|
||||
if (model.startsWith("kimi-")) return 128_000;
|
||||
if (model.startsWith("gpt-5") || model.startsWith("o1") || model.startsWith("o3") || model.startsWith("o4")) return 32_000;
|
||||
return 16_000;
|
||||
}
|
||||
|
||||
// Identifies which PR head a kimi_architect run is auditing; used to
// key the verdict cache file (`<pr_number>-<sha12>.json`).
export interface KimiArchitectContext {
  pr_number: number;
  head_sha: string; // full SHA; only the first 12 chars go into cache filenames
}
|
||||
|
||||
// On-disk shape of one cached kimi_architect verdict
// (data/_auditor/kimi_verdicts/<pr>-<sha12>.json).
interface KimiVerdictFile {
  pr_number: number;
  head_sha: string;
  cached_at: string;     // ISO timestamp of when the verdict was persisted
  model: string;         // model actually selected for this run
  latency_ms: number;
  finish_reason: string;
  usage: { prompt_tokens: number; completion_tokens: number; total_tokens: number };
  raw_content: string;   // full model output, kept for re-parsing/debugging
  findings: Finding[];   // parsed findings (empty verdicts are not cached)
  grounding: { total: number; verified: number; rate: number };
}
|
||||
|
||||
export async function runKimiArchitectCheck(
|
||||
diff: string,
|
||||
priorFindings: Finding[],
|
||||
ctx: KimiArchitectContext,
|
||||
): Promise<Finding[]> {
|
||||
const cachePath = join(KIMI_VERDICTS_DIR, `${ctx.pr_number}-${ctx.head_sha.slice(0, 12)}.json`);
|
||||
const outageSentinel = `${cachePath}.outage`;
|
||||
const OUTAGE_TTL_MS = 10 * 60 * 1000;
|
||||
|
||||
// Outage negative-cache — if upstream failed within OUTAGE_TTL_MS,
|
||||
// skip this audit and return immediately. Prevents the daemon from
|
||||
// hammering a downed Kimi/Anthropic upstream every 90s.
|
||||
if (existsSync(outageSentinel)) {
|
||||
try {
|
||||
const s = await stat(outageSentinel);
|
||||
if (Date.now() - s.mtimeMs < OUTAGE_TTL_MS) {
|
||||
const note = JSON.parse(await readFile(outageSentinel, "utf8"));
|
||||
return [skipFinding(`upstream still down (cached ${Math.round((Date.now() - s.mtimeMs) / 1000)}s ago): ${String(note.reason).slice(0, 160)}`)];
|
||||
}
|
||||
} catch { /* malformed sentinel — fall through to fresh call */ }
|
||||
}
|
||||
|
||||
// Cost cap — return cached findings if a verdict for this exact head
|
||||
// SHA was generated within the TTL.
|
||||
const cached = await loadCachedVerdict(cachePath);
|
||||
if (cached) {
|
||||
return cached.findings.length > 0
|
||||
? cached.findings
|
||||
: [{ check: "kimi_architect" as CheckKind, severity: "info", summary: "kimi_architect cached — 0 findings", evidence: [`cache: ${cachePath}`] }];
|
||||
}
|
||||
|
||||
// Alternate model based on how many audits this PR has had — gives
|
||||
// cross-lineage signal (Kimi/Moonshot ↔ Haiku/Anthropic) on every
|
||||
// PR's audit history. Count is derived from existing kimi_verdicts
|
||||
// files for this PR; cheap O(N_PRs) directory read.
|
||||
let auditIndex = 0;
|
||||
try {
|
||||
const dir = "/home/profit/lakehouse/data/_auditor/kimi_verdicts";
|
||||
if (existsSync(dir)) {
|
||||
const all = require("node:fs").readdirSync(dir) as string[];
|
||||
auditIndex = all.filter((f) => f.startsWith(`${ctx.pr_number}-`)).length;
|
||||
}
|
||||
} catch { /* default 0 — Kimi */ }
|
||||
|
||||
const selected = selectModel(diff.length, auditIndex);
|
||||
let response: { content: string; usage: any; finish_reason: string; latency_ms: number };
|
||||
try {
|
||||
response = await callKimi(buildPrompt(diff, priorFindings, ctx), selected.provider, selected.model);
|
||||
} catch (e) {
|
||||
// Negative-cache for 10 min on outage (caught 2026-04-27 by Opus
|
||||
// self-audit): without this, every audit cycle within the 24h
|
||||
// TTL re-calls upstream while it's still down. Use a sentinel
|
||||
// file with mtime check rather than persisting a verdict so the
|
||||
// happy-path cache reader doesn't have to special-case it.
|
||||
const sentinel = `${cachePath}.outage`;
|
||||
try { await writeFile(sentinel, JSON.stringify({ at: new Date().toISOString(), reason: (e as Error).message.slice(0, 200) })); } catch {}
|
||||
return [skipFinding(`kimi call failed (${selected.model}): ${(e as Error).message.slice(0, 200)}`)];
|
||||
}
|
||||
|
||||
const findings = parseFindings(response.content);
|
||||
const grounding = await computeGrounding(findings);
|
||||
|
||||
const verdict: KimiVerdictFile = {
|
||||
pr_number: ctx.pr_number,
|
||||
head_sha: ctx.head_sha,
|
||||
cached_at: new Date().toISOString(),
|
||||
model: selected.model,
|
||||
latency_ms: response.latency_ms,
|
||||
finish_reason: response.finish_reason,
|
||||
usage: {
|
||||
prompt_tokens: response.usage?.prompt_tokens ?? 0,
|
||||
completion_tokens: response.usage?.completion_tokens ?? 0,
|
||||
total_tokens: response.usage?.total_tokens ?? 0,
|
||||
},
|
||||
raw_content: response.content,
|
||||
findings,
|
||||
grounding,
|
||||
};
|
||||
|
||||
// Cache-poisoning guard (caught 2026-04-27 by Opus self-audit):
|
||||
// when parseFindings returns 0 findings (Kimi rambled, prompt too
|
||||
// big, or the markdown shape changed and our regex missed every
|
||||
// block), persisting the empty verdict short-circuits all future
|
||||
// audits in the 24h TTL window with a useless cached "0 findings"
|
||||
// result. Better to leave no cache and re-call upstream next time.
|
||||
// Always append metrics — observability shouldn't depend on whether
|
||||
// findings parsed.
|
||||
await appendMetrics(verdict);
|
||||
if (findings.length > 0) {
|
||||
await persistVerdict(cachePath, verdict);
|
||||
return findings;
|
||||
}
|
||||
return [{
|
||||
check: "kimi_architect" as CheckKind,
|
||||
severity: "info",
|
||||
summary: `kimi_architect produced 0 ranked findings (${response.finish_reason}, ${verdict.usage.completion_tokens} tokens) — not cached`,
|
||||
evidence: [`raw saved (no cache): see kimi_audits.jsonl ${verdict.cached_at}`],
|
||||
}];
|
||||
}
|
||||
|
||||
async function loadCachedVerdict(path: string): Promise<KimiVerdictFile | null> {
|
||||
if (!existsSync(path)) return null;
|
||||
try {
|
||||
const s = await stat(path);
|
||||
if (Date.now() - s.mtimeMs > CACHE_TTL_MS) return null;
|
||||
return JSON.parse(await readFile(path, "utf8")) as KimiVerdictFile;
|
||||
} catch { return null; }
|
||||
}
|
||||
|
||||
// Build the single user-message prompt for the Kimi architect pass.
// Truncates the diff to MAX_DIFF_CHARS (noting the original size in the
// prompt itself) and folds up to MAX_PRIOR_FINDINGS non-info prior
// findings into a bullet list so the model avoids repeating them.
function buildPrompt(diff: string, priorFindings: Finding[], ctx: KimiArchitectContext): string {
  // Hard cap on prompt size; the marker tells the model (and the
  // grounding rules below) that the diff is incomplete.
  const truncatedDiff = diff.length > MAX_DIFF_CHARS
    ? diff.slice(0, MAX_DIFF_CHARS) + `\n\n... [truncated; original diff was ${diff.length} chars]`
    : diff;

  // One bullet per prior finding: "[check/severity] summary — first
  // evidence snippet (capped at 160 chars)". info-level findings are
  // noise for a second-pass reviewer and are dropped.
  const priorBlock = priorFindings
    .filter(f => f.severity !== "info")
    .slice(0, MAX_PRIOR_FINDINGS)
    .map(f => `- [${f.check}/${f.severity}] ${f.summary}${f.evidence?.[0] ? ` — ${f.evidence[0].slice(0, 160)}` : ""}`)
    .join("\n");

  return `You are a senior software architect doing a second-pass review on PR #${ctx.pr_number} (head ${ctx.head_sha.slice(0, 12)}). The team's automated auditor (deepseek-v3.1:671b, N=3 consensus) already produced findings. Your job is NOT to repeat what they found — your job is to catch what their voting structure CAN'T see: compile errors, type-system bypasses, false telemetry, silent determinism leaks, schema-bypass anti-patterns, load-bearing assumptions that look fine line-by-line.

GROUNDING RULES (non-negotiable):
- Cite file:line for EVERY finding. Lines you cite must actually contain what you claim. Confabulating a finding wastes more time than missing one.
- If the diff is truncated and you can't verify a claim, say "diff-truncated, can't verify" — DO NOT guess.
- Distinguish architectural concerns (no specific line) from concrete bugs (specific line). Don't dress one as the other.

PRIOR FINDINGS FROM DEEPSEEK CONSENSUS (do not repeat these):
${priorBlock || "(none)"}

OUTPUT FORMAT (markdown):
- ## Verdict (one sentence)
- ## Findings (5-10 items, each formatted EXACTLY as below)

For each finding use this exact shape so a parser can lift them:

### F1: <one-line summary>
- **Severity:** block | warn | info
- **File:** path/to/file.ext:LINE
- **Rationale:** one or two sentences

THE DIFF:

${truncatedDiff}
`;
}
|
||||
|
||||
async function callKimi(prompt: string, provider: string, model: string): Promise<{ content: string; usage: any; finish_reason: string; latency_ms: number }> {
|
||||
const t0 = Date.now();
|
||||
const body = JSON.stringify({
|
||||
provider,
|
||||
model,
|
||||
messages: [{ role: "user", content: prompt }],
|
||||
max_tokens: maxTokensFor(model),
|
||||
temperature: 0.2,
|
||||
});
|
||||
// curl via Bun.spawn — bypasses Bun fetch's ~300s intrinsic ceiling.
|
||||
// -m sets the max transfer time honored end-to-end. Body is piped via
|
||||
// stdin to avoid argv length limits on big audit prompts (~50K+ tokens).
|
||||
const proc = Bun.spawn({
|
||||
cmd: [
|
||||
"curl", "-sS", "-X", "POST",
|
||||
"-m", String(Math.ceil(CALL_TIMEOUT_MS / 1000)),
|
||||
"-H", "content-type: application/json",
|
||||
"--data-binary", "@-",
|
||||
`${GATEWAY}/v1/chat`,
|
||||
],
|
||||
stdin: "pipe",
|
||||
stdout: "pipe",
|
||||
stderr: "pipe",
|
||||
});
|
||||
proc.stdin.write(body);
|
||||
await proc.stdin.end();
|
||||
const [stdout, stderr, exitCode] = await Promise.all([
|
||||
new Response(proc.stdout).text(),
|
||||
new Response(proc.stderr).text(),
|
||||
proc.exited,
|
||||
]);
|
||||
if (exitCode !== 0) {
|
||||
throw new Error(`curl exit ${exitCode}: ${stderr.slice(0, 300)}`);
|
||||
}
|
||||
let j: any;
|
||||
try { j = JSON.parse(stdout); }
|
||||
catch (e) {
|
||||
throw new Error(`bad response (${stdout.length} bytes): ${stdout.slice(0, 300)}`);
|
||||
}
|
||||
if (j.error || !j.choices) {
|
||||
throw new Error(`gateway error: ${JSON.stringify(j).slice(0, 300)}`);
|
||||
}
|
||||
return {
|
||||
content: j.choices?.[0]?.message?.content ?? "",
|
||||
usage: j.usage ?? {},
|
||||
finish_reason: j.choices?.[0]?.finish_reason ?? "unknown",
|
||||
latency_ms: Date.now() - t0,
|
||||
};
|
||||
}
|
||||
|
||||
// Parse Kimi's markdown into Finding[]. Format expected (per buildPrompt):
|
||||
// ### F<N>: <summary>
|
||||
// - **Severity:** block | warn | info
|
||||
// - **File:** path:line
|
||||
// - **Rationale:** ...
|
||||
function parseFindings(content: string): Finding[] {
|
||||
const findings: Finding[] = [];
|
||||
const blocks = content.split(/^###\s+F\d+:\s*/m).slice(1);
|
||||
for (const block of blocks) {
|
||||
const summary = (block.split("\n")[0] ?? "").trim();
|
||||
if (!summary) continue;
|
||||
const sev = /\*\*Severity:\*\*\s*(block|warn|info)/i.exec(block)?.[1]?.toLowerCase();
|
||||
const fileLine = /\*\*File:\*\*\s*(\S+)/i.exec(block)?.[1] ?? "unknown";
|
||||
const rationale = /\*\*Rationale:\*\*\s*([\s\S]+?)(?=\n###|\n\*\*|$)/i.exec(block)?.[1]?.trim() ?? "";
|
||||
const severity: Finding["severity"] = sev === "block" ? "block" : sev === "warn" ? "warn" : "info";
|
||||
findings.push({
|
||||
check: "kimi_architect" as CheckKind,
|
||||
severity,
|
||||
summary: summary.slice(0, 240),
|
||||
evidence: [fileLine, rationale.slice(0, 360)].filter(Boolean),
|
||||
});
|
||||
}
|
||||
return findings;
|
||||
}
|
||||
|
||||
// For each finding's cited file:line, grep the actual file to verify
|
||||
// the line exists. Returns total + verified counts; per-finding metadata
|
||||
// is appended into the evidence array so the reader can see which
|
||||
// citations were verified.
|
||||
// Verify each finding's "path:line" citation against the working tree.
// Mutates each finding in place by appending a "[grounding: ...]" note
// to its evidence array, and returns aggregate counts plus a 0..1 rate.
async function computeGrounding(findings: Finding[]): Promise<{ total: number; verified: number; rate: number }> {
  // readFile (async) instead of readFileSync — caught 2026-04-27 by
  // Kimi's self-audit. Sync I/O in an async fn blocks the event loop
  // for every cited file; doesn't matter at 10 findings, would matter
  // at 100+.
  const checks = await Promise.all(findings.map(async (f) => {
    // Citation convention: evidence[0] holds "path:line" (see parseFindings).
    const cite = f.evidence[0] ?? "";
    const m = /^(\S+?):(\d+)/.exec(cite);
    if (!m) return false;
    const [, relpath, lineStr] = m;
    const line = Number(lineStr);
    // line 0 is also rejected here — cited lines are 1-based.
    if (!line || !relpath) return false;

    // Path-traversal guard, two-layer (caught 2026-04-27 by Kimi
    // self-audits on dd77632 then 2d9cb12).
    //
    // Layer 1 (lexical): resolve() normalizes `..` segments. Refuse
    // any path that doesn't anchor under REPO_ROOT.
    //
    // Layer 2 (symlink): even if the lexical path is anchored, it
    // could be a symlink whose target escapes. realpath() resolves
    // symlinks; compare the real path against REPO_ROOT_REAL.
    //
    // Both layers exist because attackers might bypass either alone:
    // raw `../etc/passwd` triggers layer 1; a planted symlink at
    // ./safe-looking-name → /etc/passwd triggers layer 2.
    const abs = resolve(REPO_ROOT, relpath);
    if (!abs.startsWith(REPO_ROOT + "/") && abs !== REPO_ROOT) {
      f.evidence.push(`[grounding: path escapes repo root, refusing]`);
      return false;
    }

    if (!existsSync(abs)) {
      f.evidence.push("[grounding: file not found]");
      return false;
    }
    try {
      // Symlink-resolution check before any read. realpath() throws
      // if the file doesn't exist; existsSync above shields the
      // common case but a TOCTOU race could still error here — the
      // outer catch handles it.
      const realPath = await realpath(abs);
      if (!realPath.startsWith(REPO_ROOT_REAL + "/") && realPath !== REPO_ROOT_REAL) {
        f.evidence.push(`[grounding: symlink target escapes repo root, refusing]`);
        return false;
      }
      const lines = (await readFile(realPath, "utf8")).split("\n");
      // 1-based bounds check: the cited line must exist in the file.
      if (line < 1 || line > lines.length) {
        f.evidence.push(`[grounding: line ${line} > EOF (${lines.length})]`);
        return false;
      }
      // NOTE: "verified" here means the file and line NUMBER exist; the
      // cited line's CONTENT is not compared against the claim.
      f.evidence.push(`[grounding: verified at ${relpath}:${line}]`);
      return true;
    } catch (e) {
      f.evidence.push(`[grounding: read failed: ${(e as Error).message.slice(0, 80)}]`);
      return false;
    }
  }));
  const verified = checks.filter(Boolean).length;
  const total = findings.length;
  // Empty findings list counts as rate 0, not NaN.
  return { total, verified, rate: total === 0 ? 0 : verified / total };
}
|
||||
|
||||
async function persistVerdict(path: string, v: KimiVerdictFile): Promise<void> {
|
||||
await mkdir(KIMI_VERDICTS_DIR, { recursive: true });
|
||||
await writeFile(path, JSON.stringify(v, null, 2));
|
||||
}
|
||||
|
||||
async function appendMetrics(v: KimiVerdictFile): Promise<void> {
|
||||
// dirname() instead of join(path, "..") — caught 2026-04-27 by both
|
||||
// Haiku and Opus self-audits. The "/.." idiom resolves correctly
|
||||
// via Node path normalization but is non-idiomatic + breaks if the
|
||||
// path ever has trailing dots.
|
||||
await mkdir(dirname(KIMI_AUDITS_JSONL), { recursive: true });
|
||||
await appendFile(KIMI_AUDITS_JSONL, JSON.stringify({
|
||||
pr_number: v.pr_number,
|
||||
head_sha: v.head_sha,
|
||||
audited_at: v.cached_at,
|
||||
model: v.model,
|
||||
latency_ms: v.latency_ms,
|
||||
finish_reason: v.finish_reason,
|
||||
prompt_tokens: v.usage.prompt_tokens,
|
||||
completion_tokens: v.usage.completion_tokens,
|
||||
findings_total: v.findings.length,
|
||||
findings_block: v.findings.filter(f => f.severity === "block").length,
|
||||
findings_warn: v.findings.filter(f => f.severity === "warn").length,
|
||||
grounding_verified: v.grounding.verified,
|
||||
grounding_rate: Number(v.grounding.rate.toFixed(3)),
|
||||
}) + "\n");
|
||||
}
|
||||
|
||||
function skipFinding(why: string): Finding {
|
||||
return {
|
||||
check: "kimi_architect" as CheckKind,
|
||||
severity: "info",
|
||||
summary: `kimi_architect skipped — ${why}`,
|
||||
evidence: [why],
|
||||
};
|
||||
}
|
||||
@ -54,87 +54,43 @@ export function runStaticCheck(diff: string): Finding[] {
|
||||
const isAuditorCheckerFile = path.startsWith("auditor/checks/") ||
|
||||
path.startsWith("auditor/fixtures/");
|
||||
|
||||
// Track multi-line backtick-template state across the file. Walks
|
||||
// all post-merge lines (context + added, skipping removed lines)
|
||||
// in order and keeps `inMultilineBacktick` flipping on each
|
||||
// unescaped backtick. Pre-2026-04-26 the per-line walk in
|
||||
// isInsideQuotedString missed `todo!()` matches inside docstring
|
||||
// template literals because the opening backtick lived on a
|
||||
// line above the match. Now we OR the file-level state into the
|
||||
// per-line check.
|
||||
let inMultilineBacktick = false;
|
||||
|
||||
for (let idx = 0; idx < lines.length; idx++) {
|
||||
const line = lines[idx];
|
||||
if (!line.startsWith("+") || line.startsWith("+++")) continue;
|
||||
const added = line.slice(1);
|
||||
|
||||
// Diff bookkeeping lines and removed lines don't contribute to
|
||||
// the post-merge file's string state.
|
||||
if (line.startsWith("+++") || line.startsWith("---") ||
|
||||
line.startsWith("@@") || line.startsWith("\\ No newline")) continue;
|
||||
if (line.startsWith("-")) continue;
|
||||
|
||||
const isAdded = line.startsWith("+");
|
||||
// Strip the diff prefix (' ' for context, '+' for added).
|
||||
const body = (isAdded || line.startsWith(" ")) ? line.slice(1) : line;
|
||||
|
||||
// Compute the file-level backtick state ENTERING this line.
|
||||
// The state machine sees pattern matches against the right
|
||||
// context: a line that opens a backtick block has its own
|
||||
// pattern checks evaluated under "inside-backtick" semantics
|
||||
// for the portion AFTER the opening tick. Pre-2026-04-27 the
|
||||
// state was updated AFTER the pattern checks, so the FIRST
|
||||
// pattern on a backtick-opening line slipped through with
|
||||
// stale "outside-backtick" semantics. Caught by Kimi self-audit.
|
||||
const stateAtLineStart = inMultilineBacktick;
|
||||
const stateAtLineEnd = updateBacktickState(body, stateAtLineStart);
|
||||
|
||||
if (isAdded) {
|
||||
const added = body;
|
||||
|
||||
if (!isAuditorCheckerFile) {
|
||||
for (const { re, why } of BLOCK_PATTERNS) {
|
||||
const m = added.match(re);
|
||||
if (m && typeof m.index === "number") {
|
||||
// Skip if EITHER (a) the file was already inside a
|
||||
// multi-line backtick block when this line started, OR
|
||||
// (b) the match sits inside a quoted string literal on
|
||||
// THIS line. The earlier code only checked stateAtLineStart;
|
||||
// now we also check that the match isn't past the
|
||||
// opening backtick of a block that opens on this line.
|
||||
if (stateAtLineStart || isInsideQuotedString(added, m.index)) continue;
|
||||
findings.push({
|
||||
check: "static",
|
||||
severity: "block",
|
||||
summary: `${why} in ${path}`,
|
||||
evidence: [`${path}:+${idx + 1}: ${added.trim().slice(0, 160)}`],
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
for (const { re, why } of WARN_COMMENT_PATTERNS) {
|
||||
if (re.test(line)) {
|
||||
findings.push({
|
||||
check: "static",
|
||||
severity: "warn",
|
||||
summary: `${why} in ${path}`,
|
||||
evidence: [`${path}:+${idx + 1}: ${added.trim().slice(0, 160)}`],
|
||||
});
|
||||
}
|
||||
}
|
||||
for (const { re, why } of INFO_HARDCODED_PATTERNS) {
|
||||
if (!isAuditorCheckerFile) {
|
||||
for (const { re, why } of BLOCK_PATTERNS) {
|
||||
if (re.test(added)) {
|
||||
findings.push({
|
||||
check: "static",
|
||||
severity: "info",
|
||||
severity: "block",
|
||||
summary: `${why} in ${path}`,
|
||||
evidence: [`${path}:+${idx + 1}: ${added.trim().slice(0, 160)}`],
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Carry the end-of-line state forward to the next iteration.
|
||||
inMultilineBacktick = stateAtLineEnd;
|
||||
for (const { re, why } of WARN_COMMENT_PATTERNS) {
|
||||
if (re.test(line)) {
|
||||
findings.push({
|
||||
check: "static",
|
||||
severity: "warn",
|
||||
summary: `${why} in ${path}`,
|
||||
evidence: [`${path}:+${idx + 1}: ${added.trim().slice(0, 160)}`],
|
||||
});
|
||||
}
|
||||
}
|
||||
for (const { re, why } of INFO_HARDCODED_PATTERNS) {
|
||||
if (re.test(added)) {
|
||||
findings.push({
|
||||
check: "static",
|
||||
severity: "info",
|
||||
summary: `${why} in ${path}`,
|
||||
evidence: [`${path}:+${idx + 1}: ${added.trim().slice(0, 160)}`],
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// "Field added but never read" heuristic — catches exactly the
|
||||
@ -143,20 +99,10 @@ export function runStaticCheck(diff: string): Finding[] {
|
||||
// elsewhere might exist). The point is: if NEITHER this diff nor
|
||||
// any other line in the diff reads the field, the PR is shipping
|
||||
// state without a consumer.
|
||||
//
|
||||
// Serde exemption: if the field's parent struct derives Serialize
|
||||
// or Deserialize, the read-site is the macro itself — JSON
|
||||
// round-trips consume every public field. Without this exemption
|
||||
// the check produces false positives on every response/request
|
||||
// struct shipped through `/v1/*`.
|
||||
const addedLines = lines.filter(l => l.startsWith("+") && !l.startsWith("+++"))
|
||||
.map(l => l.slice(1));
|
||||
const newFields = extractNewFieldsWithLine(lines);
|
||||
const seenNames = new Set<string>();
|
||||
for (const { name: field, lineIdx } of newFields) {
|
||||
if (seenNames.has(field)) continue;
|
||||
seenNames.add(field);
|
||||
if (parentStructHasSerdeDerive(lines, lineIdx)) continue;
|
||||
const newFields = extractNewFields(addedLines);
|
||||
for (const field of newFields) {
|
||||
const readPattern = new RegExp(`[\\.:]\\s*${escape(field)}\\b|\\b${escape(field)}\\s*:`);
|
||||
// The definition line itself matches readPattern — filter it out
|
||||
// by requiring at least TWO lines in the diff mention the field
|
||||
@ -194,116 +140,18 @@ function splitDiffByFile(diff: string): Map<string, string[]> {
|
||||
return out;
|
||||
}
|
||||
|
||||
// Extract new `pub name: Type,` fields from the per-file diff lines,
|
||||
// keeping each occurrence's line index so the caller can resolve the
|
||||
// parent struct. Same narrow rules as before: starts with `pub `,
|
||||
// excludes `pub fn` / `pub struct` / etc.
|
||||
function extractNewFieldsWithLine(lines: string[]): Array<{ name: string; lineIdx: number }> {
|
||||
const out: Array<{ name: string; lineIdx: number }> = [];
|
||||
for (let i = 0; i < lines.length; i++) {
|
||||
const line = lines[i];
|
||||
if (!line.startsWith("+") || line.startsWith("+++")) continue;
|
||||
const t = line.slice(1).trim();
|
||||
// Extract new `pub name: Type,` fields from added lines. Rust syntax.
|
||||
// Narrowly-scoped: only matches at the start of a trimmed line,
|
||||
// requires `pub ` prefix, ignores `pub fn` / `pub struct` / etc.
|
||||
function extractNewFields(addedLines: string[]): string[] {
|
||||
const fields = new Set<string>();
|
||||
for (const line of addedLines) {
|
||||
const t = line.trim();
|
||||
// pub NAME: Type,
|
||||
const m = t.match(/^pub\s+(?!fn\b|struct\b|enum\b|mod\b|use\b|trait\b|impl\b|const\b|static\b|type\b)(\w+)\s*:/);
|
||||
if (m) out.push({ name: m[1], lineIdx: i });
|
||||
if (m) fields.add(m[1]);
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
// True if the field at `fieldLineIdx` lives inside a struct whose
|
||||
// declaration carries `#[derive(... Serialize|Deserialize ...)]`. We
|
||||
// walk backward through the diff (added + context lines both count —
|
||||
// a struct declaration unchanged by the PR still appears as context)
|
||||
// to find the nearest `pub struct` boundary, then scan a few lines
|
||||
// above it for derive attributes. Conservative bounds:
|
||||
// - 80 lines back to find `struct` (struct definitions can grow large)
|
||||
// - 8 lines above the `struct` keyword for attribute lines
|
||||
// Stops the struct-search early if we hit a `}` at zero indent
|
||||
// (the previous scope) or another `pub struct` (we left ours).
|
||||
function parentStructHasSerdeDerive(lines: string[], fieldLineIdx: number): boolean {
|
||||
// Bounds-check fieldLineIdx (caught 2026-04-27 by Kimi self-audit).
|
||||
// Pre-fix: if fieldLineIdx >= lines.length, the loop ran from a
|
||||
// negative implicit upper bound (fieldLineIdx - 80 could be > 0
|
||||
// even when fieldLineIdx is past EOF) and read undefined slots.
|
||||
// Defensive: bail early on out-of-range input.
|
||||
if (fieldLineIdx < 0 || fieldLineIdx >= lines.length) return false;
|
||||
|
||||
let structLineIdx = -1;
|
||||
for (let i = fieldLineIdx - 1; i >= 0 && i >= fieldLineIdx - 80; i--) {
|
||||
const raw = lines[i];
|
||||
if (typeof raw !== "string" || raw.length === 0) continue;
|
||||
const body = stripDiffPrefix(raw);
|
||||
const trimmed = body.trim();
|
||||
if (/^pub\s+struct\s+\w/.test(trimmed)) {
|
||||
structLineIdx = i;
|
||||
break;
|
||||
}
|
||||
// Closing brace at column 0 means the enclosing scope ended above
|
||||
// the field — we're not actually inside a struct.
|
||||
if (body.startsWith("}")) return false;
|
||||
}
|
||||
if (structLineIdx < 0) return false;
|
||||
|
||||
for (let j = structLineIdx - 1; j >= 0 && j >= structLineIdx - 8; j--) {
|
||||
const raw = lines[j];
|
||||
if (typeof raw !== "string") continue;
|
||||
const trimmed = stripDiffPrefix(raw).trim();
|
||||
if (trimmed === "" || trimmed.startsWith("//") || trimmed.startsWith("///")) continue;
|
||||
if (!trimmed.startsWith("#[")) break;
|
||||
if (/derive\s*\([^)]*\b(Serialize|Deserialize)\b/.test(trimmed)) return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Strip leading +/-/space from a unified-diff line, leaving the raw
|
||||
// source line. Handles the case where the line is shorter than 1 char
|
||||
// (rare but real for empty-context lines).
|
||||
function stripDiffPrefix(line: string): string {
|
||||
if (line.length === 0) return line;
|
||||
const c = line[0];
|
||||
if (c === "+" || c === "-" || c === " ") return line.slice(1);
|
||||
return line;
|
||||
}
|
||||
|
||||
// Walk a single line and toggle the cross-line backtick state on each
|
||||
// unescaped backtick. Single-quote and double-quote runs are line-
|
||||
// bounded in JS/TS/Rust by language rules (string literals don't span
|
||||
// newlines without explicit `\` continuation), so we only track
|
||||
// backticks across lines. Returns the new state for the next line.
|
||||
function updateBacktickState(line: string, inBacktick: boolean): boolean {
|
||||
let state = inBacktick;
|
||||
let inDouble = false;
|
||||
let inSingle = false;
|
||||
for (let i = 0; i < line.length; i++) {
|
||||
const c = line[i];
|
||||
const esc = i > 0 && line[i - 1] === "\\";
|
||||
if (esc) continue;
|
||||
// Inside a multi-line backtick template, single/double quotes
|
||||
// don't open new strings — they're literal characters of the
|
||||
// template. Same applies the other way around.
|
||||
if (c === '"' && !inSingle && !state) inDouble = !inDouble;
|
||||
else if (c === "'" && !inDouble && !state) inSingle = !inSingle;
|
||||
else if (c === "`" && !inDouble && !inSingle) state = !state;
|
||||
}
|
||||
return state;
|
||||
}
|
||||
|
||||
// True if `pos` falls inside a double- or single-quoted string on this
|
||||
// line (backtick template literals too). Walks left→right toggling the
|
||||
// "in quote" state on each unescaped quote. Per-line only — the file-
|
||||
// level walk in runStaticCheck handles multi-line backtick templates
|
||||
// via updateBacktickState.
|
||||
function isInsideQuotedString(line: string, pos: number): boolean {
|
||||
let inDouble = false, inSingle = false, inBacktick = false;
|
||||
for (let i = 0; i < pos; i++) {
|
||||
const c = line[i];
|
||||
const esc = i > 0 && line[i - 1] === "\\";
|
||||
if (esc) continue;
|
||||
if (c === '"' && !inSingle && !inBacktick) inDouble = !inDouble;
|
||||
else if (c === "'" && !inDouble && !inBacktick) inSingle = !inSingle;
|
||||
else if (c === "`" && !inDouble && !inSingle) inBacktick = !inBacktick;
|
||||
}
|
||||
return inDouble || inSingle || inBacktick;
|
||||
return Array.from(fields);
|
||||
}
|
||||
|
||||
function escape(s: string): string {
|
||||
|
||||
@ -49,58 +49,6 @@ const WEAK_PATTERNS: RegExp[] = [
|
||||
/\bprobably\b/i,
|
||||
];
|
||||
|
||||
// Empirical claims: runtime measurements / observed outcomes that can't
|
||||
// be verified from a diff (only from the actual run that produced
|
||||
// them). Classifying as empirical lets the inference check skip
|
||||
// diff-verification and saves the ladder for falsifiable claims.
|
||||
//
|
||||
// Two classes share this bucket because they share the skip discipline:
|
||||
//
|
||||
// 1. Runtime metrics — "58 cloud calls", "306s end-to-end"
|
||||
// 2. History/proof refs — "verified on PR #8", "was flipping across runs"
|
||||
//
|
||||
// Both are assertions about state outside the current diff. The cloud
|
||||
// would flag them as "not backed" — but that's a false positive: the
|
||||
// proof lives in the referenced run, prior commit, or test output, not
|
||||
// in the added lines the cloud is reading.
|
||||
// Patterns whose match classifies a claim line as "empirical" — state
// outside the current diff that diff-verification can never prove.
const EMPIRICAL_PATTERNS: RegExp[] = [
  // ─── Runtime metrics ───
  // Iteration / attempt counts: "6/6 iterations", "attempt 5", "accepted on attempt 3"
  /\b\d+\s*\/\s*\d+\s+(iterations?|attempts?|cycles?|runs?|shards?)\b/i,
  /\b(accepted|resolved|converged)\s+on\s+attempt\s+\d+\b/i,
  // Runtime metrics: "58 cloud calls", "306s end-to-end", "245s total", "5931 chars"
  /\b\d+\s+(cloud\s+)?calls?\b/i,
  /\b\d+\s*(ms|s|seconds?|minutes?|m)\s+(end[- ]to[- ]end|total|elapsed|duration)\b/i,
  /\b\d+\s+chars?\b.*\b(accepted|generated|produced)\b/i,
  // "escalated through N tiers", "N distinct models"
  /\bescalated\s+through\s+\d+\b/i,
  /\b\d+\s+distinct\s+(model|tier)s?\b/i,

  // ─── History / proof references ───
  // "verified on PR #8", "verified end-to-end on PR 8", "tested against PR #4"
  // Require PR#N / commit-hash / "prior <word>" to avoid matching
  // "verified ... in production" (PR without \b-ish anchor previously
  // consumed "pr" of "production").
  /\bverified\s+(?:end[- ]to[- ]end\s+)?(?:on|against|in)\s+(?:PR\s*#?\d+|commit\s+[0-9a-f]{6,}|prior\s+\w+|the\s+\w+\s+audit)\b/i,
  /\btested\s+(?:against|in|on)\s+(?:PR\s*#?\d+|commit\s+[0-9a-f]{6,}|prior\s+\w+)\b/i,
  // Direct PR/commit references: "PR #8", "on PR 9", "from commit abc123"
  /\b(?:on|from|in|via|per)\s+PR\s*#?\d+\b/i,
  /\b(?:from|in|per|against)\s+commit\s+[0-9a-f]{6,}/i,
  // Observational descriptions of prior behavior: "was flipping", "was X before", "previously observed"
  /\b(?:was|were)\s+(?:flipping|drifting|inconsistent|non[- ]deterministic|creeping)\b/i,
  /\bpreviously\s+(?:observed|flagged|reported|seen|landed)\b/i,
  /\bused\s+to\s+(?:flip|fail|flag|reject|block)\b/i,
  /\bobserved\s+(?:in|during|on|across)\s+(?:PR|prior|\d+\s+(?:runs?|audits?))/i,
  // "flipping/drifting across N runs" — historical variance description
  /\b(?:flipping|drifting|varying|oscillating)\s+across\s+(?:\d+\s+)?(?:runs?|audits?|iterations?)\b/i,
  // "the proven X" referring to prior work (proven is a STRONG pattern
  // but in context "the proven FOO" is usually a historical reference,
  // not a fresh claim). We catch it here so the empirical skip wins.
  /\bthe\s+proven\s+(?:escalation\s+ladder|pipeline|flow|loop|tier|path)/i,
  // "from the 9-run test", "across the 5-run validation"
  /\b(?:from|across|in|during)\s+the\s+\d+[- ]run\s+(?:test|validation|probe|experiment)/i,
];
|
||||
|
||||
export interface ParsedClaims {
|
||||
claims: Claim[];
|
||||
commits_scanned: number;
|
||||
@ -129,22 +77,9 @@ function scanText(text: string, location_prefix: string, commit_sha: string, out
|
||||
const line = lines[i];
|
||||
if (line.length < 3) continue;
|
||||
|
||||
// Empirical match wins over everything else — if a line ALSO
|
||||
// contains a moderate word like "complete", we still want to
|
||||
// classify it as empirical so the inference check doesn't ask
|
||||
// the cloud to prove "58 cloud calls" from the diff. Order:
|
||||
// empirical → strong → moderate → weak.
|
||||
const empirical = firstUnquotedMatch(line, EMPIRICAL_PATTERNS);
|
||||
if (empirical) {
|
||||
out.push({
|
||||
text: line.trim().slice(0, 200),
|
||||
commit_sha,
|
||||
location: `${location_prefix}:${i + 1}`,
|
||||
strength: "empirical",
|
||||
});
|
||||
continue;
|
||||
}
|
||||
const strong = firstUnquotedMatch(line, STRONG_PATTERNS);
|
||||
// Strong patterns first — if a line matches strong, it's strong,
|
||||
// don't double-count as moderate.
|
||||
const strong = firstMatch(line, STRONG_PATTERNS);
|
||||
if (strong) {
|
||||
out.push({
|
||||
text: line.trim().slice(0, 200),
|
||||
@ -154,7 +89,7 @@ function scanText(text: string, location_prefix: string, commit_sha: string, out
|
||||
});
|
||||
continue;
|
||||
}
|
||||
const moderate = firstUnquotedMatch(line, MODERATE_PATTERNS);
|
||||
const moderate = firstMatch(line, MODERATE_PATTERNS);
|
||||
if (moderate) {
|
||||
out.push({
|
||||
text: line.trim().slice(0, 200),
|
||||
@ -164,7 +99,7 @@ function scanText(text: string, location_prefix: string, commit_sha: string, out
|
||||
});
|
||||
continue;
|
||||
}
|
||||
const weak = firstUnquotedMatch(line, WEAK_PATTERNS);
|
||||
const weak = firstMatch(line, WEAK_PATTERNS);
|
||||
if (weak) {
|
||||
out.push({
|
||||
text: line.trim().slice(0, 200),
|
||||
@ -176,35 +111,9 @@ function scanText(text: string, location_prefix: string, commit_sha: string, out
|
||||
}
|
||||
}
|
||||
|
||||
// Match a pattern only when its match position is NOT inside a quoted
|
||||
// string on the line. Mirrors the same guard in auditor/checks/static.ts
|
||||
// — the two files have the same false-positive class: PR authors
|
||||
// quote pattern examples in commit message bodies (e.g. `"Phase 45
|
||||
// shipped"` as a test example) and without this guard those quoted
|
||||
// references get flagged as fresh ship-claims. Only skips when the
|
||||
// match itself falls inside quotes; real (unquoted) uses of the same
|
||||
// vocabulary still classify correctly.
|
||||
function firstUnquotedMatch(text: string, patterns: RegExp[]): RegExp | null {
|
||||
function firstMatch(text: string, patterns: RegExp[]): RegExp | null {
|
||||
for (const p of patterns) {
|
||||
const m = text.match(p);
|
||||
if (!m || typeof m.index !== "number") continue;
|
||||
if (isInsideQuotedString(text, m.index)) continue;
|
||||
return p;
|
||||
if (p.test(text)) return p;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
// Walks left→right toggling in-quote state on each unescaped quote.
|
||||
// Good enough for single-line claims; multi-line strings aren't parsed.
|
||||
function isInsideQuotedString(line: string, pos: number): boolean {
|
||||
let inDouble = false, inSingle = false, inBacktick = false;
|
||||
for (let i = 0; i < pos; i++) {
|
||||
const c = line[i];
|
||||
const esc = i > 0 && line[i - 1] === "\\";
|
||||
if (esc) continue;
|
||||
if (c === '"' && !inSingle && !inBacktick) inDouble = !inDouble;
|
||||
else if (c === "'" && !inDouble && !inBacktick) inSingle = !inSingle;
|
||||
else if (c === "`" && !inDouble && !inSingle) inBacktick = !inBacktick;
|
||||
}
|
||||
return inDouble || inSingle || inBacktick;
|
||||
}
|
||||
|
||||
@ -1,271 +0,0 @@
|
||||
// fact_extractor — routes curated TEXT through llm_team_ui's
|
||||
// "knowledge extract facts" mode (mode=extract at /api/run).
|
||||
//
|
||||
// What it gives us: structured {facts, entities, relationships} from
|
||||
// whatever curated blob we send. Auditor sends the tree-split
|
||||
// inference scratchpad (the best distillation of what a PR changed).
|
||||
// Scrum_master will later send its accepted review bodies.
|
||||
//
|
||||
// Why route through llm_team and not just extract directly from our
|
||||
// own checks: llm_team's extract uses a local EXTRACTOR model
|
||||
// (qwen2.5) + a separate VERIFIER (gemma2). This cross-check is the
|
||||
// discipline J wants for knowledge going into the playbook — facts
|
||||
// go in only after a second model has rated them CORRECT /
|
||||
// UNVERIFIABLE. Fast (local models, ~10-20s), free, and matches the
|
||||
// codereview pattern J already trusts.
|
||||
//
|
||||
// SSE parsing: llm_team streams SSE events. We're only interested in
|
||||
// the final "response" event with role="final" + the extraction
|
||||
// response (role="extraction N"). Parse the JSON from the extractor's
|
||||
// response text.
|
||||
|
||||
// llm_team base URL; override with LH_LLM_TEAM_URL.
const LLM_TEAM = process.env.LH_LLM_TEAM_URL ?? "http://localhost:5000";
// Local extractor model (first pass, per the module header).
const EXTRACTOR = process.env.LH_FACT_EXTRACTOR ?? "qwen2.5:latest";
// Local verifier model (second, cross-check pass).
const VERIFIER = process.env.LH_FACT_VERIFIER ?? "gemma2:latest";
// Time budget (ms) for one extract run — consumer is below this excerpt.
const EXTRACT_TIMEOUT_MS = 120_000;
// Project-context preamble prepended to the extraction source text.
const PROJECT_CONTEXT_FILE = process.env.LH_AUDITOR_CONTEXT_FILE
  ?? "/home/profit/lakehouse/docs/AUDITOR_CONTEXT.md";

// Memoized project context: null = not yet loaded, "" = file unreadable.
let cachedContext: string | null = null;
|
||||
async function loadProjectContext(): Promise<string> {
|
||||
if (cachedContext !== null) return cachedContext;
|
||||
try {
|
||||
const { readFile } = await import("node:fs/promises");
|
||||
const raw = await readFile(PROJECT_CONTEXT_FILE, "utf8");
|
||||
// Cap at 4KB — anything past that is more noise than signal for
|
||||
// the extractor/verifier's attention budget.
|
||||
cachedContext = raw.slice(0, 4000);
|
||||
} catch {
|
||||
cachedContext = ""; // context file missing → extractor runs without preamble
|
||||
}
|
||||
return cachedContext;
|
||||
}
|
||||
|
||||
// One named thing the extractor model pulled out of the source text.
export interface Entity {
  name: string;
  // Free-form category string as emitted by the extractor model.
  type: string;
  // Truncated to 240 chars by extractFacts; absent when the extractor
  // returned no string description.
  description?: string;
}
// Directed `from` → `to` edge extracted alongside entities. Endpoints
// are free-form names (presumably entity names — not enforced by
// extractFacts, which only drops edges with an empty endpoint).
export interface Relationship {
  from: string;
  to: string;
  type: string;
}
// Result envelope for one extract run. Always fully shaped: error
// paths return this with `error` set and the lists empty.
export interface ExtractedFacts {
  // Verifier-gated fact list: raw extractor facts minus anything the
  // verifier affirmatively marked INCORRECT (see extractFacts).
  facts: string[];
  entities: Entity[];
  relationships: Relationship[];
  // Verifier's free-form prose, truncated to 1500 chars.
  verification: string;
  // Model names actually used for this run (env-overridable defaults).
  extractor_model: string;
  verifier_model: string;
  // First 240 chars of the input text, for operator spot-checks.
  source_preview: string;
  // Populated when the extract run completed server-side (llm_team
  // persists to its own team_runs; this is for our own cross-ref).
  llm_team_run_id?: number;
  // ISO timestamp taken when extraction started.
  extracted_at: string;
  // Per-fact verdicts from the verifier pass (CORRECT/INCORRECT/
  // UNVERIFIABLE/UNCHECKED). Aligned 1:1 with the *raw* fact list
  // pre-drop so operators can see which verdicts mapped to dropped
  // facts if needed.
  verifier_verdicts?: Array<"CORRECT" | "INCORRECT" | "UNVERIFIABLE" | "UNCHECKED">;
  facts_dropped_by_verifier?: number;
  // Set when the pipeline failed; extractFacts never throws.
  error?: string;
}
/**
|
||||
* Run the llm_team extract pipeline on `source` text. Returns
|
||||
* structured {facts, entities, relationships}.
|
||||
*
|
||||
* Returns an object with `error` set if the pipeline failed — never
|
||||
* throws, because fact extraction is best-effort enrichment (the
|
||||
* primary audit must not break if llm_team is down).
|
||||
*/
|
||||
export async function extractFacts(source: string): Promise<ExtractedFacts> {
|
||||
const base: ExtractedFacts = {
|
||||
facts: [],
|
||||
entities: [],
|
||||
relationships: [],
|
||||
verification: "",
|
||||
extractor_model: EXTRACTOR,
|
||||
verifier_model: VERIFIER,
|
||||
source_preview: source.slice(0, 240),
|
||||
extracted_at: new Date().toISOString(),
|
||||
};
|
||||
|
||||
// Prepend project context to the source so the extractor + verifier
|
||||
// know what codebase/framework these facts belong to. Without this,
|
||||
// the verifier marks most domain-specific facts as UNVERIFIABLE ("I
|
||||
// don't know what Lakehouse is"). With it, the verifier can CORRECT-
|
||||
// stamp facts that align with the stated architecture.
|
||||
const context = await loadProjectContext();
|
||||
const prompt = context.length > 0
|
||||
? `=== PROJECT CONTEXT (for grounding facts; do NOT extract facts from this section) ===\n${context}\n\n=== CONTENT TO EXTRACT FACTS FROM ===\n${source}`
|
||||
: source;
|
||||
|
||||
let resp: Response;
|
||||
try {
|
||||
resp = await fetch(`${LLM_TEAM}/api/run`, {
|
||||
method: "POST",
|
||||
headers: { "content-type": "application/json" },
|
||||
body: JSON.stringify({
|
||||
mode: "extract",
|
||||
prompt,
|
||||
extractor: EXTRACTOR,
|
||||
verifier: VERIFIER,
|
||||
source: "prompt",
|
||||
skip_cache: true, // cache by prompt would dedup identical
|
||||
// scratchpads, but we want fresh extraction
|
||||
// for per-audit facts; cheap since local.
|
||||
}),
|
||||
signal: AbortSignal.timeout(EXTRACT_TIMEOUT_MS),
|
||||
});
|
||||
} catch (e) {
|
||||
return { ...base, error: `fetch failed: ${(e as Error).message}` };
|
||||
}
|
||||
|
||||
if (!resp.ok) {
|
||||
const body = await resp.text().catch(() => "");
|
||||
return { ...base, error: `llm_team /api/run ${resp.status}: ${body.slice(0, 200)}` };
|
||||
}
|
||||
|
||||
// Stream SSE lines; collect the one extraction response + the run_saved event
|
||||
// so we can capture the team-runs ID for cross-ref.
|
||||
const decoder = new TextDecoder();
|
||||
const reader = resp.body?.getReader();
|
||||
if (!reader) return { ...base, error: "no response body" };
|
||||
|
||||
let buffer = "";
|
||||
let extractionText = "";
|
||||
let verifierText = "";
|
||||
let runId: number | undefined = undefined;
|
||||
|
||||
try {
|
||||
while (true) {
|
||||
const { done, value } = await reader.read();
|
||||
if (done) break;
|
||||
buffer += decoder.decode(value, { stream: true });
|
||||
let nl: number;
|
||||
while ((nl = buffer.indexOf("\n\n")) >= 0) {
|
||||
const chunk = buffer.slice(0, nl);
|
||||
buffer = buffer.slice(nl + 2);
|
||||
const dataLine = chunk.split("\n").find(l => l.startsWith("data: "));
|
||||
if (!dataLine) continue;
|
||||
try {
|
||||
const ev = JSON.parse(dataLine.slice(6));
|
||||
if (ev.type === "response") {
|
||||
const role = String(ev.role ?? "");
|
||||
if (role.startsWith("extraction")) extractionText = String(ev.text ?? "");
|
||||
else if (role === "verifier") verifierText = String(ev.text ?? "");
|
||||
} else if (ev.type === "run_saved") {
|
||||
const id = Number(ev.run_id);
|
||||
if (Number.isFinite(id)) runId = id;
|
||||
}
|
||||
} catch { /* skip malformed SSE */ }
|
||||
}
|
||||
}
|
||||
} catch (e) {
|
||||
return { ...base, error: `SSE read failed: ${(e as Error).message}` };
|
||||
}
|
||||
|
||||
// Pull the JSON object out of extractionText (may be wrapped in ```json fences).
|
||||
const parsed = extractFirstJsonObject(extractionText);
|
||||
if (!parsed) {
|
||||
return { ...base, error: "extractor returned no parseable JSON", verification: verifierText };
|
||||
}
|
||||
|
||||
const rawFacts: string[] = Array.isArray(parsed.facts)
|
||||
? parsed.facts.slice(0, 50).map(String)
|
||||
: [];
|
||||
|
||||
// Parse the verifier's free-form prose into per-fact verdicts, then
|
||||
// drop any fact the verifier explicitly marked INCORRECT. Leave
|
||||
// UNVERIFIABLE in place: many of our extractions are domain-specific
|
||||
// (Lakehouse internals) and the verifier has no prior-knowledge
|
||||
// anchor, so UNVERIFIABLE is the expected verdict for new signal,
|
||||
// not a quality fail. This is verifier-gated persistence: drop only
|
||||
// what's affirmatively wrong, not what's novel.
|
||||
const verdicts = parseVerifierVerdicts(verifierText, rawFacts.length);
|
||||
const incorrectIdx = new Set<number>();
|
||||
verdicts.forEach((v, i) => { if (v === "INCORRECT") incorrectIdx.add(i); });
|
||||
const kept = rawFacts.filter((_, i) => !incorrectIdx.has(i));
|
||||
|
||||
return {
|
||||
...base,
|
||||
facts: kept,
|
||||
entities: Array.isArray(parsed.entities)
|
||||
? parsed.entities.slice(0, 30).map((e: any) => ({
|
||||
name: String(e?.name ?? ""),
|
||||
type: String(e?.type ?? ""),
|
||||
description: typeof e?.description === "string" ? e.description.slice(0, 240) : undefined,
|
||||
})).filter(e => e.name.length > 0)
|
||||
: [],
|
||||
relationships: Array.isArray(parsed.relationships)
|
||||
? parsed.relationships.slice(0, 30).map((r: any) => ({
|
||||
from: String(r?.from ?? ""),
|
||||
to: String(r?.to ?? ""),
|
||||
type: String(r?.type ?? ""),
|
||||
})).filter(r => r.from.length > 0 && r.to.length > 0)
|
||||
: [],
|
||||
verification: verifierText.slice(0, 1500),
|
||||
facts_dropped_by_verifier: incorrectIdx.size,
|
||||
verifier_verdicts: verdicts,
|
||||
llm_team_run_id: runId,
|
||||
};
|
||||
}
|
||||
|
||||
// Parse verifier's free-form output into a per-fact verdict array.
|
||||
// Gemma2 uses several formats depending on prompt mood:
|
||||
// Format A: **1.** claim... * **Verdict:** CORRECT
|
||||
// Format B: **1.** claim... * **CORRECT** (no "Verdict:" label)
|
||||
// Format C: 1. claim... CORRECT
|
||||
// Strategy: split on fact numbers, then find the first
|
||||
// CORRECT|INCORRECT|UNVERIFIABLE token in each section. Handles all
|
||||
// three formats without regex gymnastics.
|
||||
function parseVerifierVerdicts(
|
||||
verifierText: string,
|
||||
numFacts: number,
|
||||
): Array<"CORRECT" | "INCORRECT" | "UNVERIFIABLE" | "UNCHECKED"> {
|
||||
const out: Array<"CORRECT" | "INCORRECT" | "UNVERIFIABLE" | "UNCHECKED"> =
|
||||
Array(numFacts).fill("UNCHECKED");
|
||||
if (!verifierText) return out;
|
||||
|
||||
// Find each fact section start — "**N.**" or "N." at line start —
|
||||
// and slice out the content up to the NEXT fact number. Each section
|
||||
// gets scanned for the first CORRECT/INCORRECT/UNVERIFIABLE token.
|
||||
const starts: Array<{ idx: number; pos: number }> = [];
|
||||
const header = /(?:^|\n)\s*(?:\*\*)?(\d+)[.)]/g;
|
||||
for (const m of verifierText.matchAll(header)) {
|
||||
const factNum = Number(m[1]);
|
||||
if (!Number.isFinite(factNum)) continue;
|
||||
starts.push({ idx: factNum - 1, pos: m.index! });
|
||||
}
|
||||
for (let i = 0; i < starts.length; i++) {
|
||||
const s = starts[i];
|
||||
const end = i + 1 < starts.length ? starts[i + 1].pos : verifierText.length;
|
||||
if (s.idx < 0 || s.idx >= numFacts) continue;
|
||||
const section = verifierText.slice(s.pos, end);
|
||||
const v = section.match(/\b(CORRECT|INCORRECT|UNVERIFIABLE)\b/i);
|
||||
if (v) out[s.idx] = v[1].toUpperCase() as "CORRECT" | "INCORRECT" | "UNVERIFIABLE";
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
// Lift the first balanced JSON object out of (possibly fenced) text.
|
||||
// Same discipline as inference.ts::extractJson.
|
||||
function extractFirstJsonObject(text: string): any | null {
|
||||
const cleaned = text.replace(/^```(?:json)?\s*/im, "").replace(/```\s*$/im, "");
|
||||
let depth = 0, start = -1;
|
||||
for (let i = 0; i < cleaned.length; i++) {
|
||||
const c = cleaned[i];
|
||||
if (c === "{") { if (depth === 0) start = i; depth++; }
|
||||
else if (c === "}") {
|
||||
depth--;
|
||||
if (depth === 0 && start >= 0) {
|
||||
try { return JSON.parse(cleaned.slice(start, i + 1)); } catch { start = -1; }
|
||||
}
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
@ -24,30 +24,14 @@ const POLL_INTERVAL_MS = 90_000; // 90s — enough budget for audit runs to comp
|
||||
// Sentinel file — presumably checked each cycle to pause auditing
// (cycles_skipped_paused counter suggests so); TODO confirm in the
// poll loop, which is outside this chunk.
const PAUSE_FILE = "/home/profit/lakehouse/auditor.paused";
// JSON file where the daemon persists its State across restarts
// (see loadState / saveState).
const STATE_FILE = "/home/profit/lakehouse/data/_auditor/state.json";
// Per-PR audit cap. Prevents the daemon from running away on a PR
|
||||
// when each push surfaces new findings — operator wants to review
|
||||
// in batch, not have the daemon burn budget while they're away.
|
||||
// Default 3 audits per PR. Override via LH_AUDITOR_MAX_AUDITS_PER_PR.
|
||||
// Set to 0 to disable the cap.
|
||||
//
|
||||
// Reset (after manual review): edit data/_auditor/state.json and
|
||||
// set audit_count_per_pr.<N> = 0 (or delete the key). Daemon picks
|
||||
// up the change on the next cycle without restart.
|
||||
const MAX_AUDITS_PER_PR = Number(process.env.LH_AUDITOR_MAX_AUDITS_PER_PR) || 3;
|
||||
|
||||
// Daemon state, persisted to STATE_FILE between cycles and restarts.
interface State {
  // Map: PR number → last-audited head SHA. Lets us dedupe audits
  // across restarts (poller can crash/restart without re-auditing
  // all open PRs from scratch).
  last_audited: Record<string, string>;
  // Map: PR number → number of audits run on that PR since last reset.
  // Daemon halts auditing a PR once this hits MAX_AUDITS_PER_PR.
  // Operator clears the entry to resume.
  audit_count_per_pr: Record<string, number>;
  // ISO timestamp set when the state file was first created.
  started_at: string;
  // Lifetime counters, maintained by the poll loop.
  cycles_total: number;
  // Cycles skipped while paused — presumably via PAUSE_FILE; the
  // increment site is outside this chunk (TODO confirm).
  cycles_skipped_paused: number;
  // PR skips due to the per-PR audit cap (incremented per capped PR).
  cycles_skipped_capped: number;
  audits_run: number;
  // ISO timestamp of the most recent cycle, if any has completed.
  last_cycle_at?: string;
}
@ -63,21 +47,17 @@ async function loadState(): Promise<State> {
|
||||
return {
|
||||
last_audited: s.last_audited ?? {},
|
||||
started_at: s.started_at ?? new Date().toISOString(),
|
||||
audit_count_per_pr: s.audit_count_per_pr ?? {},
|
||||
cycles_total: s.cycles_total ?? 0,
|
||||
cycles_skipped_paused: s.cycles_skipped_paused ?? 0,
|
||||
cycles_skipped_capped: s.cycles_skipped_capped ?? 0,
|
||||
audits_run: s.audits_run ?? 0,
|
||||
last_cycle_at: s.last_cycle_at,
|
||||
};
|
||||
} catch {
|
||||
return {
|
||||
last_audited: {},
|
||||
audit_count_per_pr: {},
|
||||
started_at: new Date().toISOString(),
|
||||
cycles_total: 0,
|
||||
cycles_skipped_paused: 0,
|
||||
cycles_skipped_capped: 0,
|
||||
audits_run: 0,
|
||||
};
|
||||
}
|
||||
@ -109,38 +89,12 @@ async function runCycle(state: State): Promise<State> {
|
||||
console.log(`[auditor] cycle ${state.cycles_total}: ${prs.length} open PR(s)`);
|
||||
|
||||
for (const pr of prs) {
|
||||
const prKey = String(pr.number);
|
||||
const last = state.last_audited[prKey];
|
||||
const last = state.last_audited[String(pr.number)];
|
||||
if (last === pr.head_sha) {
|
||||
console.log(`[auditor] skip PR #${pr.number} (SHA ${pr.head_sha.slice(0, 8)} already audited)`);
|
||||
continue;
|
||||
}
|
||||
// Per-head-SHA audit cap. Each new push gets MAX_AUDITS_PER_PR
|
||||
// fresh attempts; the counter auto-resets when the head SHA
|
||||
// changes. Operator only intervenes manually if a single SHA
|
||||
// somehow needs MORE than the cap (rare — usually transient
|
||||
// upstream errors clear themselves inside 3 attempts).
|
||||
//
|
||||
// Reset rule: if `last` exists (we've seen this PR before) AND
|
||||
// pr.head_sha != last, that's a new push. Drop the counter.
|
||||
// The dedup branch above already handles same-SHA → skip, so
|
||||
// we only land here when the SHA actually moved.
|
||||
if (last !== undefined && (state.audit_count_per_pr[prKey] ?? 0) > 0) {
|
||||
const prior_count = state.audit_count_per_pr[prKey];
|
||||
console.log(`[auditor] PR #${pr.number} new head ${pr.head_sha.slice(0, 8)} (prior ${last.slice(0, 8)}, was ${prior_count}/${MAX_AUDITS_PER_PR}) — resetting cap counter`);
|
||||
state.audit_count_per_pr[prKey] = 0;
|
||||
}
|
||||
const auditedSoFar = state.audit_count_per_pr[prKey] ?? 0;
|
||||
if (MAX_AUDITS_PER_PR > 0 && auditedSoFar >= MAX_AUDITS_PER_PR) {
|
||||
// This branch only fires now if the SAME head SHA somehow
|
||||
// burned MAX audits (transient upstream errors retried that
|
||||
// many times). Operator can clear state.audit_count_per_pr.<N>
|
||||
// = 0 to force one more attempt; otherwise wait for next push.
|
||||
console.log(`[auditor] skip PR #${pr.number} (same head ${pr.head_sha.slice(0, 8)} burned ${auditedSoFar}/${MAX_AUDITS_PER_PR} — push new code or clear state.json audit_count_per_pr.${prKey})`);
|
||||
state.cycles_skipped_capped += 1;
|
||||
continue;
|
||||
}
|
||||
console.log(`[auditor] audit PR #${pr.number} (${pr.head_sha.slice(0, 8)}) — ${pr.title.slice(0, 60)} [${auditedSoFar + 1}/${MAX_AUDITS_PER_PR}]`);
|
||||
console.log(`[auditor] audit PR #${pr.number} (${pr.head_sha.slice(0, 8)}) — ${pr.title.slice(0, 60)}`);
|
||||
try {
|
||||
// Skip dynamic by default: it mutates live playbook state and
|
||||
// re-runs on every PR update would pollute quickly. Operator
|
||||
@ -152,22 +106,8 @@ async function runCycle(state: State): Promise<State> {
|
||||
skip_inference: process.env.LH_AUDITOR_SKIP_INFERENCE === "1",
|
||||
});
|
||||
console.log(`[auditor] verdict=${verdict.overall} findings=${verdict.metrics.findings_total} (block=${verdict.metrics.findings_block} warn=${verdict.metrics.findings_warn})`);
|
||||
state.last_audited[prKey] = pr.head_sha;
|
||||
state.audit_count_per_pr[prKey] = auditedSoFar + 1;
|
||||
state.last_audited[String(pr.number)] = pr.head_sha;
|
||||
state.audits_run += 1;
|
||||
if (state.audit_count_per_pr[prKey] >= MAX_AUDITS_PER_PR) {
|
||||
console.log(`[auditor] PR #${pr.number} reached cap (${MAX_AUDITS_PER_PR} audits) — daemon will skip further audits until reset`);
|
||||
}
|
||||
// Persist state immediately after each successful audit so the
|
||||
// increment survives a crash. Pre-2026-04-27 the cycle saved
|
||||
// once at the end (main.ts:140), which lost the count if the
|
||||
// daemon was killed mid-cycle. Fix lifted from kimi_architect's
|
||||
// own audit on this very file. saveState is idempotent + cheap
|
||||
// (one JSON write), so per-audit cost is negligible.
|
||||
try { await saveState(state); }
|
||||
catch (e) {
|
||||
console.error(`[auditor] saveState mid-cycle failed: ${(e as Error).message} — count held in memory`);
|
||||
}
|
||||
} catch (e) {
|
||||
console.error(`[auditor] audit failed: ${(e as Error).message}`);
|
||||
}
|
||||
|
||||
@ -1,161 +0,0 @@
|
||||
// kb_index — generic on-the-fly aggregation over append-only JSONL
|
||||
// scratchpads (audit_lessons, scrum_reviews, outcomes, observer ops).
|
||||
//
|
||||
// The mem0 insight: raw rows are CHEAP and tell the full story, but
|
||||
// downstream prompts need a DEFINITION, not a log. A definition is
|
||||
// the aggregate: "this signature has fired N times across M distinct
|
||||
// scopes, first_seen=X, last_seen=Y, confidence=M/N."
|
||||
//
|
||||
// This library is the single shared aggregator. Every KB writer keeps
|
||||
// appending raw rows; every KB reader uses aggregate() instead of
|
||||
// tailing the raw stream. No second file to sync, no ADD/UPDATE/NOOP
|
||||
// routing — the stats roll up from the raw rows every time.
|
||||
//
|
||||
// Why this works past hundreds of runs:
|
||||
// - aggregate() is bounded by distinct_signatures, not total_rows.
|
||||
// - confidence = distinct_scopes / count — low for same-scope noise,
|
||||
// high for cross-scope patterns. Downstream severity ramps on
|
||||
// confidence × count, not raw count, so one unfixed PR can't
|
||||
// inflate its own recurrence score (the classic mem0 failure).
|
||||
// - rotation (later) moves old raw to archive files; aggregate()
|
||||
// can still read both to compute lifetime counts when needed.
|
||||
|
||||
import { readFile } from "node:fs/promises";
|
||||
|
||||
// One rolled-up "definition" per signature, derived from raw JSONL rows.
export interface AggregateRow {
  // The dedup key this row aggregates over (opts.keyFn output).
  signature: string;
  // Total raw rows seen for this signature (within tailLimit).
  count: number;
  // Number of unique scope values (opts.scopeFn output).
  distinct_scopes: number;
  // Lexicographic min/max of the rows' time field (ISO strings, so
  // string order == chronological order).
  first_seen: string;
  last_seen: string;
  confidence: number; // distinct_scopes / count — capped at 1.0
  representative_summary: string; // most-recent summary for this signature
  scopes: string[]; // up to 20 most-recent scopes for debugging
  checks: string[]; // distinct `check` values (audit_lessons-specific)
}
// Caller-supplied accessors telling aggregate() how to read one raw row.
export interface AggregateOptions<T> {
  /** How to extract the dedup key from a row. Rows yielding a falsy key are skipped. */
  keyFn: (row: T) => string | undefined;
  /** How to extract the "scope" — distinct scopes count gives confidence. */
  scopeFn: (row: T) => string | undefined;
  /** How to extract the timestamp (defaults to row.audited_at / row.reviewed_at / row.timestamp / row.ran_at). */
  timeFn?: (row: T) => string | undefined;
  /** How to extract a representative summary (defaults to row.summary / row.representative_summary). */
  summaryFn?: (row: T) => string | undefined;
  /** Max rows to read from the JSONL tail; 0 = read all. */
  tailLimit?: number;
  /** Include per-row check field (for multi-check aggregates). */
  checkFn?: (row: T) => string | undefined;
}
/**
 * Read a JSONL file and produce the aggregate map keyed by signature.
 * Safe on missing or malformed files — returns empty map.
 *
 * Rows are processed in file order (oldest → newest), so "most
 * recent" below means "latest line in the file", not wall-clock time.
 * Malformed JSON lines and rows with no key are skipped silently.
 */
export async function aggregate<T = any>(
  jsonlPath: string,
  opts: AggregateOptions<T>,
): Promise<Map<string, AggregateRow>> {
  const out = new Map<string, AggregateRow>();
  let raw: string;
  // Missing/unreadable file → empty aggregate, by contract.
  try { raw = await readFile(jsonlPath, "utf8"); } catch { return out; }
  const lines = raw.split("\n").filter(l => l.length > 0);
  // tailLimit bounds work by the newest N lines; 0/undefined = all.
  const sliceFrom = opts.tailLimit && opts.tailLimit > 0 ? Math.max(0, lines.length - opts.tailLimit) : 0;

  const timeFn = opts.timeFn ?? ((r: any) => r?.audited_at ?? r?.reviewed_at ?? r?.timestamp ?? r?.ran_at);
  const summaryFn = opts.summaryFn ?? ((r: any) => r?.summary ?? r?.representative_summary);

  // Per-signature scope tracking — need counts by scope to compute
  // distinct_scopes without double-counting a scope that appears 50
  // times. Using a Set<scope> per signature.
  const scopeSets = new Map<string, Set<string>>();
  const checkSets = new Map<string, Set<string>>();

  for (let i = sliceFrom; i < lines.length; i++) {
    let row: T;
    try { row = JSON.parse(lines[i]) as T; } catch { continue; }
    const sig = opts.keyFn(row);
    if (!sig) continue;

    // First sighting of a signature: seed the aggregate row and its
    // companion sets (created together, so later .get(sig)! is safe).
    let agg = out.get(sig);
    if (!agg) {
      agg = {
        signature: sig,
        count: 0,
        distinct_scopes: 0,
        first_seen: "",
        last_seen: "",
        confidence: 0,
        representative_summary: "",
        scopes: [],
        checks: [],
      };
      out.set(sig, agg);
      scopeSets.set(sig, new Set<string>());
      checkSets.set(sig, new Set<string>());
    }

    agg.count += 1;

    const scope = opts.scopeFn(row);
    if (scope !== undefined && scope !== null && scope !== "") {
      scopeSets.get(sig)!.add(String(scope));
      // Keep scopes array ordered by recency (newest wins — shift
      // oldest when at cap). A repeat scope is moved to the back
      // rather than duplicated.
      const arr = agg.scopes;
      const s = String(scope);
      const existing = arr.indexOf(s);
      if (existing >= 0) arr.splice(existing, 1);
      arr.push(s);
      if (arr.length > 20) arr.shift();
    }

    if (opts.checkFn) {
      const c = opts.checkFn(row);
      if (c) checkSets.get(sig)!.add(String(c));
    }

    // String comparison is fine here: timestamps are assumed ISO-8601,
    // where lexicographic order matches chronological order.
    const t = timeFn(row);
    if (t) {
      if (!agg.first_seen || t < agg.first_seen) agg.first_seen = t;
      if (!agg.last_seen || t > agg.last_seen) agg.last_seen = t;
    }

    // Last writer wins — the most recent row's summary represents.
    const s = summaryFn(row);
    if (s) agg.representative_summary = String(s);
  }

  // Finalize derived fields.
  for (const [sig, agg] of out) {
    const scopes = scopeSets.get(sig) ?? new Set<string>();
    agg.distinct_scopes = scopes.size;
    agg.confidence = agg.count > 0 ? Math.min(1, agg.distinct_scopes / agg.count) : 0;
    const checks = checkSets.get(sig);
    if (checks) agg.checks = Array.from(checks).sort();
  }
  return out;
}
/**
|
||||
* Severity policy derived from aggregate stats. The rating lives here
|
||||
* (not in each check) so all KB readers ramp severity consistently.
|
||||
*
|
||||
* - confidence × count product is the real signal.
|
||||
* - Low confidence (< 0.3) = same-scope noise → info regardless of count.
|
||||
* - Mid confidence (0.3-0.6) = mixed signal → warn at count ≥ 3.
|
||||
* - High confidence (> 0.6) with count ≥ 5 = block-worthy cross-cutting pattern.
|
||||
*
|
||||
* Callers can override by reading agg directly; this is the default
|
||||
* policy that matches the "don't escalate one unfixed PR" discipline.
|
||||
*/
|
||||
export function ratingSeverity(agg: AggregateRow): "info" | "warn" | "block" {
|
||||
if (agg.confidence >= 0.6 && agg.count >= 5) return "block";
|
||||
if (agg.confidence >= 0.3 && agg.count >= 3) return "warn";
|
||||
return "info";
|
||||
}
|
||||
|
||||
/** Human-friendly one-line summary of an aggregate row for finding evidence. */
|
||||
export function formatAgg(agg: AggregateRow): string {
|
||||
return `count=${agg.count} distinct_scopes=${agg.distinct_scopes} confidence=${agg.confidence.toFixed(2)} seen=[${agg.first_seen.slice(0, 10)}..${agg.last_seen.slice(0, 10)}]`;
|
||||
}
|
||||
@ -1,269 +0,0 @@
|
||||
// kb_stats — on-demand dashboard numbers from the KB scratchpad
|
||||
// files. Reads data/_auditor/verdicts/*, data/_kb/audit_lessons.jsonl,
|
||||
// data/_kb/audit_facts.jsonl, data/_kb/audit_discrepancies.jsonl,
|
||||
// data/_kb/scrum_reviews.jsonl and prints:
|
||||
//
|
||||
// - verdict flip-flop rate (same SHA re-audited, verdict changed?)
|
||||
// - consensus discrepancy rate (N runs disagreed on a claim)
|
||||
// - confidence distribution from kb_index aggregator
|
||||
// - top N recurring entities from audit_facts
|
||||
// - fact growth over time
|
||||
// - scrum vs inference KB split
|
||||
//
|
||||
// Run: bun run auditor/kb_stats.ts
|
||||
// bun run auditor/kb_stats.ts --top 15 # show top 15 entities
|
||||
// bun run auditor/kb_stats.ts --json # machine-readable
|
||||
//
|
||||
// This is the "dashboard" without running Grafana. If someone really
|
||||
// wants a dashboard, wire this output into a static HTML page + cron.
|
||||
|
||||
import { readFile, readdir } from "node:fs/promises";
|
||||
import { join } from "node:path";
|
||||
import { aggregate } from "./kb_index.ts";
|
||||
|
||||
// Absolute paths into the Lakehouse checkout, consumed by the stats
// collectors below.
const REPO = "/home/profit/lakehouse";
// Per-audit verdict files named "<pr>-<sha>.json" (see loadVerdicts).
const VERDICTS_DIR = `${REPO}/data/_auditor/verdicts`;
// Append-only JSONL scratchpads written by the auditor/scrum flows.
const AUDIT_LESSONS = `${REPO}/data/_kb/audit_lessons.jsonl`;
const AUDIT_FACTS = `${REPO}/data/_kb/audit_facts.jsonl`;
const AUDIT_DISCREPANCIES = `${REPO}/data/_kb/audit_discrepancies.jsonl`;
const SCRUM_REVIEWS = `${REPO}/data/_kb/scrum_reviews.jsonl`;
// Parsed CLI flags: bun run auditor/kb_stats.ts [--top N] [--json]
interface Args {
  // How many top entities to report (--top N; default 10).
  top: number;
  // --json flag; presumably switches to machine-readable output
  // (consumer is outside this chunk — TODO confirm in main).
  json: boolean;
}
function parseArgs(argv: string[]): Args {
|
||||
const a: Args = { top: 10, json: false };
|
||||
for (let i = 2; i < argv.length; i++) {
|
||||
if (argv[i] === "--top") a.top = Number(argv[++i] ?? 10);
|
||||
else if (argv[i] === "--json") a.json = true;
|
||||
}
|
||||
return a;
|
||||
}
|
||||
|
||||
async function readJsonl<T = any>(path: string): Promise<T[]> {
|
||||
try {
|
||||
const raw = await readFile(path, "utf8");
|
||||
return raw.split("\n").filter(l => l.length > 0).map(l => {
|
||||
try { return JSON.parse(l) as T; } catch { return null as any; }
|
||||
}).filter(r => r !== null);
|
||||
} catch { return []; }
|
||||
}
|
||||
|
||||
async function loadVerdicts(): Promise<Array<{ pr: number; sha: string; overall: string; findings_total: number; findings_block: number; findings_warn: number }>> {
|
||||
let files: string[] = [];
|
||||
try { files = await readdir(VERDICTS_DIR); } catch { return []; }
|
||||
const out = [];
|
||||
for (const f of files) {
|
||||
if (!f.endsWith(".json")) continue;
|
||||
const m = f.match(/^(\d+)-([0-9a-f]+)\.json$/);
|
||||
if (!m) continue;
|
||||
try {
|
||||
const v = JSON.parse(await readFile(join(VERDICTS_DIR, f), "utf8"));
|
||||
out.push({
|
||||
pr: Number(m[1]),
|
||||
sha: m[2],
|
||||
overall: String(v.overall),
|
||||
findings_total: Number(v.metrics?.findings_total ?? 0),
|
||||
findings_block: Number(v.metrics?.findings_block ?? 0),
|
||||
findings_warn: Number(v.metrics?.findings_warn ?? 0),
|
||||
});
|
||||
} catch { /* skip corrupt */ }
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
// Everything the dashboard prints, collected in one pass (see collect).
interface Stats {
  // Total verdict files found under VERDICTS_DIR.
  audit_count: number;
  // How many audits landed on each overall verdict value.
  verdict_distribution: Record<string, number>;
  // Same PR with multiple SHAs — if verdicts differ, that's drift across
  // the PR's commit history. Not a flip-flop in the classical sense,
  // but worth surfacing (e.g. "PR #8 was block block req req block").
  per_pr_verdict_sequences: Record<number, string[]>;
  // For each PR with ≥ 2 audits, how many distinct verdicts did it
  // produce? 1 = stable; 2+ = some flipping.
  verdict_instability: { pr_count: number; pr_with_multiple_verdicts: number; pr_with_3plus: number };
  // Counts over audit_discrepancies.jsonl, split by resolution value.
  consensus: { discrepancy_count: number; tiebreaker_used: number; unresolved: number };
  // Raw KB sizes plus cross-PR entity recurrence counts.
  kb: {
    audit_lessons_rows: number;
    audit_facts_rows: number;
    scrum_reviews_rows: number;
    distinct_finding_signatures: number;
    distinct_entities_across_prs: number;
    entities_in_2plus_prs: number;
    entities_in_5plus_prs: number;
  };
  // Verifier-pass quality numbers rolled up from audit_facts rows.
  fact_quality: {
    verifier_verdict_distribution: Record<string, number>;
    facts_dropped_by_verifier_total: number;
    // Fraction of extraction rows that produced any facts or entities.
    extraction_success_rate: number;
  };
  // Entities ranked by cross-PR spread (then raw count), capped at --top.
  top_entities: Array<{ name: string; distinct_prs: number; count: number; types: string[] }>;
  // Row counts per `source` field across audit_facts.
  kb_by_source: Record<string, number>;
}
// Single pass over all KB inputs → one Stats object. Read-only; every
// loader tolerates missing files, so this runs safely on a fresh repo.
async function collect(args: Args): Promise<Stats> {
  const verdicts = await loadVerdicts();
  const lessons = await readJsonl<any>(AUDIT_LESSONS);
  const facts = await readJsonl<any>(AUDIT_FACTS);
  const disc = await readJsonl<any>(AUDIT_DISCREPANCIES);
  const reviews = await readJsonl<any>(SCRUM_REVIEWS);

  // Verdict stability: group overall verdicts by PR and tally the
  // global verdict distribution in the same pass.
  const byPr: Record<number, string[]> = {};
  const verdictDist: Record<string, number> = {};
  for (const v of verdicts) {
    (byPr[v.pr] ??= []).push(v.overall);
    verdictDist[v.overall] = (verdictDist[v.overall] ?? 0) + 1;
  }
  // multi = PRs whose audits produced 2+ distinct verdicts; tri = 3+.
  let multi = 0, tri = 0;
  for (const [_, seq] of Object.entries(byPr)) {
    const distinct = new Set(seq);
    if (distinct.size >= 2) multi++;
    if (distinct.size >= 3) tri++;
  }

  // Consensus drift — split discrepancy rows by their resolution field.
  const consensus = {
    discrepancy_count: disc.length,
    tiebreaker_used: disc.filter(d => String(d.resolution).startsWith("tiebreaker")).length,
    unresolved: disc.filter(d => d.resolution === "unresolved").length,
  };

  // Lesson signatures — scope is the PR so confidence reflects
  // cross-PR recurrence, not repeats within one PR.
  const lessonAgg = await aggregate<any>(AUDIT_LESSONS, {
    keyFn: r => r?.signature,
    scopeFn: r => (r?.pr_number !== undefined ? `pr-${r.pr_number}` : undefined),
  });

  // Entity aggregation across audit_facts rows, keyed by lowercased
  // name. Also accumulates per-source row counts, verifier verdict
  // totals, and the extraction success numerator in the same loop.
  interface EntAgg { distinct_prs: Set<number>; count: number; types: Set<string>; name: string; sources: Set<string> }
  const entAgg = new Map<string, EntAgg>();
  const sourceCount: Record<string, number> = {};
  let totalVerdictDist: Record<string, number> = { CORRECT: 0, INCORRECT: 0, UNVERIFIABLE: 0, UNCHECKED: 0 };
  let factsDroppedTotal = 0;
  let extractionsWithFacts = 0;

  for (const row of facts) {
    const src = String(row.source ?? "unknown");
    sourceCount[src] = (sourceCount[src] ?? 0) + 1;
    const pr = Number(row.pr_number);
    if (Array.isArray(row.verifier_verdicts)) {
      for (const v of row.verifier_verdicts) {
        totalVerdictDist[v] = (totalVerdictDist[v] ?? 0) + 1;
      }
    }
    factsDroppedTotal += Number(row.facts_dropped_by_verifier ?? 0);
    // "Success" = the extraction yielded at least one fact or entity.
    if ((Array.isArray(row.facts) && row.facts.length > 0) || (Array.isArray(row.entities) && row.entities.length > 0)) {
      extractionsWithFacts++;
    }
    for (const e of Array.isArray(row.entities) ? row.entities : []) {
      const name = String(e?.name ?? "").trim();
      // Very short names are noise ("a", "PR") — skip them.
      if (name.length < 3) continue;
      const key = name.toLowerCase();
      const agg = entAgg.get(key) ?? { distinct_prs: new Set(), count: 0, types: new Set(), name, sources: new Set() };
      agg.count++;
      if (Number.isFinite(pr) && pr > 0) agg.distinct_prs.add(pr);
      if (e?.type) agg.types.add(String(e.type));
      agg.sources.add(src);
      entAgg.set(key, agg);
    }
  }

  const entitiesIn2Plus = Array.from(entAgg.values()).filter(a => a.distinct_prs.size >= 2).length;
  const entitiesIn5Plus = Array.from(entAgg.values()).filter(a => a.distinct_prs.size >= 5).length;
  // Rank by cross-PR spread first, raw count as tiebreaker.
  const topEntities = Array.from(entAgg.values())
    .sort((a, b) => b.distinct_prs.size - a.distinct_prs.size || b.count - a.count)
    .slice(0, args.top)
    .map(a => ({
      name: a.name,
      distinct_prs: a.distinct_prs.size,
      count: a.count,
      types: Array.from(a.types),
    }));

  const stats: Stats = {
    audit_count: verdicts.length,
    verdict_distribution: verdictDist,
    per_pr_verdict_sequences: byPr,
    verdict_instability: {
      pr_count: Object.keys(byPr).length,
      pr_with_multiple_verdicts: multi,
      pr_with_3plus: tri,
    },
    consensus,
    kb: {
      audit_lessons_rows: lessons.length,
      audit_facts_rows: facts.length,
      scrum_reviews_rows: reviews.length,
      distinct_finding_signatures: lessonAgg.size,
      distinct_entities_across_prs: entAgg.size,
      entities_in_2plus_prs: entitiesIn2Plus,
      entities_in_5plus_prs: entitiesIn5Plus,
    },
    fact_quality: {
      verifier_verdict_distribution: totalVerdictDist,
      facts_dropped_by_verifier_total: factsDroppedTotal,
      extraction_success_rate: facts.length > 0 ? extractionsWithFacts / facts.length : 0,
    },
    top_entities: topEntities,
    kb_by_source: sourceCount,
  };
  return stats;
}
function renderHuman(s: Stats): string {
|
||||
const lines: string[] = [];
|
||||
lines.push("═══ KB STATS ═══");
|
||||
lines.push("");
|
||||
lines.push(`Audits: ${s.audit_count} total across ${s.verdict_instability.pr_count} distinct PRs`);
|
||||
lines.push(`Verdicts: ${Object.entries(s.verdict_distribution).map(([k, v]) => `${k}=${v}`).join(" ")}`);
|
||||
const multiplePct = s.verdict_instability.pr_count > 0
|
||||
? Math.round(100 * s.verdict_instability.pr_with_multiple_verdicts / s.verdict_instability.pr_count)
|
||||
: 0;
|
||||
lines.push(`Verdict instability: ${s.verdict_instability.pr_with_multiple_verdicts}/${s.verdict_instability.pr_count} PRs had 2+ distinct verdicts (${multiplePct}%) — 3+ distinct: ${s.verdict_instability.pr_with_3plus}`);
|
||||
lines.push("");
|
||||
lines.push("─── Consensus ───");
|
||||
lines.push(` discrepancies logged: ${s.consensus.discrepancy_count}`);
|
||||
lines.push(` tiebreaker used: ${s.consensus.tiebreaker_used}`);
|
||||
lines.push(` unresolved: ${s.consensus.unresolved}`);
|
||||
const dRate = s.audit_count > 0 ? (100 * s.consensus.discrepancy_count / s.audit_count).toFixed(1) : "0";
|
||||
lines.push(` discrepancy rate: ${dRate}% of audits`);
|
||||
lines.push("");
|
||||
lines.push("─── KB size ───");
|
||||
lines.push(` audit_lessons.jsonl: ${s.kb.audit_lessons_rows} rows, ${s.kb.distinct_finding_signatures} distinct signatures`);
|
||||
lines.push(` audit_facts.jsonl: ${s.kb.audit_facts_rows} rows, ${s.kb.distinct_entities_across_prs} distinct entities`);
|
||||
lines.push(` scrum_reviews.jsonl: ${s.kb.scrum_reviews_rows} rows`);
|
||||
lines.push(` entities in 2+ PRs: ${s.kb.entities_in_2plus_prs}`);
|
||||
lines.push(` entities in 5+ PRs: ${s.kb.entities_in_5plus_prs} ← strong cross-cutting signal`);
|
||||
lines.push("");
|
||||
lines.push("─── Fact quality ───");
|
||||
const v = s.fact_quality.verifier_verdict_distribution;
|
||||
lines.push(` verifier verdicts: CORRECT=${v.CORRECT ?? 0} UNVERIFIABLE=${v.UNVERIFIABLE ?? 0} UNCHECKED=${v.UNCHECKED ?? 0} INCORRECT=${v.INCORRECT ?? 0}`);
|
||||
lines.push(` facts dropped by verifier: ${s.fact_quality.facts_dropped_by_verifier_total}`);
|
||||
lines.push(` extraction success rate: ${(s.fact_quality.extraction_success_rate * 100).toFixed(1)}%`);
|
||||
lines.push("");
|
||||
lines.push("─── KB sources ───");
|
||||
for (const [src, n] of Object.entries(s.kb_by_source)) {
|
||||
lines.push(` ${src}: ${n}`);
|
||||
}
|
||||
lines.push("");
|
||||
lines.push(`─── Top ${s.top_entities.length} recurring entities ───`);
|
||||
for (const e of s.top_entities) {
|
||||
lines.push(` [${e.distinct_prs} PRs × ${e.count} obs] ${e.name} (${e.types.join(",")})`);
|
||||
}
|
||||
return lines.join("\n");
|
||||
}
|
||||
|
||||
async function main() {
|
||||
const args = parseArgs(process.argv);
|
||||
const stats = await collect(args);
|
||||
if (args.json) {
|
||||
console.log(JSON.stringify(stats, (_, v) => v instanceof Set ? Array.from(v) : v, 2));
|
||||
} else {
|
||||
console.log(renderHuman(stats));
|
||||
}
|
||||
}
|
||||
|
||||
main().catch(e => { console.error("[kb_stats] fatal:", e); process.exit(1); });
|
||||
@ -1,85 +0,0 @@
|
||||
// drift_report.ts — comparison of a current run summary vs the
|
||||
// previous run summary on disk. Spec calls this "drift detection";
|
||||
// concretely it answers: did the pipeline behave the same way as
|
||||
// last time, and if not, was the change explained by an input change
|
||||
// or did it appear out of nowhere (silent drift)?
|
||||
//
|
||||
// Severity:
|
||||
// ok — within 20% on every metric, no hash surprises
|
||||
// warn — record-count or category swing > 20%, OR new error class
|
||||
// alert — output_hash differs while input_hash is identical
|
||||
// (deterministic violation — same input → different output)
|
||||
|
||||
import {
|
||||
ValidationResult, requireString, requireIsoTimestamp,
|
||||
} from "./types";
|
||||
import type { StageName } from "./stage_receipt";
|
||||
|
||||
// Current schema version; validateDriftReport rejects any other value.
export const DRIFT_REPORT_SCHEMA_VERSION = 2;
// Fractional swing boundary (20%) between "ok" and "warn" severity —
// see the severity legend in the file header.
export const DRIFT_THRESHOLD_PCT = 0.20;

export type DriftSeverity = "ok" | "warn" | "alert";

// Per-stage comparison of the current run against the prior run on disk.
export interface StageDrift {
  stage: StageName;
  delta_records_in: number; // current - prior
  delta_records_out: number;
  delta_accepted: number;
  delta_quarantined: number;
  pct_change_out: number | null; // null when prior had 0 records
  // null when input_hash isn't materialized into the stage summary —
  // schema v1 lied and reported `true` here. v2 is honest: callers
  // that want determinism enforcement must read the full StageReceipt
  // off disk and compute input_hash equality there.
  input_hash_match: boolean | null;
  output_hash_match: boolean;
  // alert if input_hash matches but output_hash diverges
  deterministic_violation: boolean;
  notes: string[];
}

// Whole-run drift verdict: one row per stage plus a rolled-up severity.
export interface DriftReport {
  schema_version: number;
  run_id: string;
  prior_run_id: string | null; // null when no prior run on disk
  generated_at: string; // ISO 8601 (enforced by validateDriftReport)
  severity: DriftSeverity;
  stages: StageDrift[];
  // Top-level swings the human reader should see immediately.
  flags: string[];
}
|
||||
|
||||
export function validateDriftReport(input: unknown): ValidationResult<DriftReport> {
|
||||
const errors: string[] = [];
|
||||
if (typeof input !== "object" || input === null) {
|
||||
return { valid: false, errors: ["expected object"] };
|
||||
}
|
||||
const r = input as Record<string, unknown>;
|
||||
let ok = true;
|
||||
|
||||
if (r.schema_version !== DRIFT_REPORT_SCHEMA_VERSION) {
|
||||
errors.push(`schema_version: expected ${DRIFT_REPORT_SCHEMA_VERSION}, got ${JSON.stringify(r.schema_version)}`);
|
||||
ok = false;
|
||||
}
|
||||
ok = requireString(r.run_id, "run_id", errors) && ok;
|
||||
if (r.prior_run_id !== null && typeof r.prior_run_id !== "string") {
|
||||
errors.push("prior_run_id: must be string or null");
|
||||
ok = false;
|
||||
}
|
||||
ok = requireIsoTimestamp(r.generated_at, "generated_at", errors) && ok;
|
||||
if (!["ok", "warn", "alert"].includes(r.severity as string)) {
|
||||
errors.push(`severity: must be ok|warn|alert, got ${JSON.stringify(r.severity)}`);
|
||||
ok = false;
|
||||
}
|
||||
if (!Array.isArray(r.stages)) {
|
||||
errors.push("stages: expected array");
|
||||
ok = false;
|
||||
}
|
||||
if (!Array.isArray(r.flags)) {
|
||||
errors.push("flags: expected array");
|
||||
ok = false;
|
||||
}
|
||||
|
||||
if (!ok) return { valid: false, errors };
|
||||
return { valid: true, value: r as unknown as DriftReport };
|
||||
}
|
||||
@ -1,116 +0,0 @@
|
||||
// EvidenceRecord schema tests.
|
||||
//
|
||||
// Two positive fixtures (one per real-source prototype: distilled_facts
|
||||
// + contract_analyses) and three negative fixtures pinning the
|
||||
// non-negotiable invariants the spec demands:
|
||||
// - every record must trace to a source (provenance)
|
||||
// - schema_version must match — silent v1/v2 drift is the worst kind
|
||||
// - required identity fields (run_id) cannot be missing
|
||||
//
|
||||
// Run with: bun test auditor/schemas/distillation/evidence_record.test.ts
|
||||
|
||||
import { test, expect } from "bun:test";
|
||||
import { readFileSync } from "node:fs";
|
||||
import { resolve } from "node:path";
|
||||
|
||||
import { validateEvidenceRecord, EVIDENCE_SCHEMA_VERSION } from "./evidence_record";
|
||||
|
||||
const FIXTURE_DIR = resolve(import.meta.dir, "fixtures");
|
||||
|
||||
function loadFixture(name: string): unknown {
|
||||
return JSON.parse(readFileSync(resolve(FIXTURE_DIR, name), "utf8"));
|
||||
}
|
||||
|
||||
test("EVIDENCE_SCHEMA_VERSION is 1 — bump deliberately, never silently", () => {
  expect(EVIDENCE_SCHEMA_VERSION).toBe(1);
});

test("positive: distilled_fact materialized record validates", () => {
  const r = validateEvidenceRecord(loadFixture("evidence_positive_distilled_fact.json"));
  // Print the errors before asserting so a failure is diagnosable from CI output.
  if (!r.valid) console.error("unexpected errors:", r.errors);
  expect(r.valid).toBe(true);
  if (r.valid) {
    expect(r.value.run_id).toBe("cae21289");
    expect(r.value.model_role).toBe("extractor");
    expect(r.value.provenance.source_file).toBe("data/_kb/distilled_facts.jsonl");
  }
});

test("positive: contract_analysis materialized record validates with retrieval + observer fields", () => {
  const r = validateEvidenceRecord(loadFixture("evidence_positive_contract_analysis.json"));
  if (!r.valid) console.error("unexpected errors:", r.errors);
  expect(r.valid).toBe(true);
  if (r.valid) {
    expect(r.value.observer_verdict).toBe("reject");
    expect(r.value.observer_confidence).toBe(95);
    expect(r.value.retrieved_context?.matrix_corpora?.length).toBe(4);
    expect(r.value.failure_markers).toContain("observer_rejected");
  }
});

test("negative: missing run_id is rejected with a specific error", () => {
  const r = validateEvidenceRecord(loadFixture("evidence_negative_no_run_id.json"));
  expect(r.valid).toBe(false);
  if (!r.valid) {
    expect(r.errors.some(e => e.includes("run_id"))).toBe(true);
  }
});

test("negative: schema_version mismatch is rejected (silent v1/v2 drift guard)", () => {
  const r = validateEvidenceRecord(loadFixture("evidence_negative_bad_schema_version.json"));
  expect(r.valid).toBe(false);
  if (!r.valid) {
    expect(r.errors.some(e => e.includes("schema_version"))).toBe(true);
  }
});

test("negative: bad provenance (non-sha256 sig_hash, non-ISO timestamp) is rejected", () => {
  const r = validateEvidenceRecord(loadFixture("evidence_negative_bad_provenance.json"));
  expect(r.valid).toBe(false);
  if (!r.valid) {
    // Must catch BOTH the sig_hash AND the recorded_at — comprehensive
    // error reporting is part of the contract.
    expect(r.errors.some(e => e.includes("sig_hash"))).toBe(true);
    expect(r.errors.some(e => e.includes("recorded_at"))).toBe(true);
  }
});

test("negative: non-object input is rejected with clear error", () => {
  const r = validateEvidenceRecord("not an object");
  expect(r.valid).toBe(false);
  if (!r.valid) {
    expect(r.errors[0]).toContain("expected object");
  }
});

test("negative: human_override with invalid decision is rejected", () => {
  // Start from a known-good fixture and corrupt only the field under test.
  const fixture = loadFixture("evidence_positive_distilled_fact.json") as Record<string, unknown>;
  fixture.human_override = {
    overrider: "test-user",
    decision: "maybe", // invalid — must be accept|reject|needs_review
    reason: "test",
    overridden_at: "2026-04-26T22:30:00.000Z",
  };
  const r = validateEvidenceRecord(fixture);
  expect(r.valid).toBe(false);
  if (!r.valid) {
    expect(r.errors.some(e => e.includes("human_override.decision"))).toBe(true);
  }
});

test("positive: human_override = null is allowed (explicitly no override)", () => {
  const fixture = loadFixture("evidence_positive_distilled_fact.json") as Record<string, unknown>;
  fixture.human_override = null;
  const r = validateEvidenceRecord(fixture);
  expect(r.valid).toBe(true);
});

test("negative: observer_confidence outside [0, 100] is rejected", () => {
  const fixture = loadFixture("evidence_positive_contract_analysis.json") as Record<string, unknown>;
  fixture.observer_confidence = 150;
  const r = validateEvidenceRecord(fixture);
  expect(r.valid).toBe(false);
  if (!r.valid) {
    expect(r.errors.some(e => e.includes("observer_confidence"))).toBe(true);
  }
});
|
||||
@ -1,202 +0,0 @@
|
||||
// EvidenceRecord — the unified per-execution-trace record that the
|
||||
// Evidence View emits and the Success Scorer reads.
|
||||
//
|
||||
// Derived from now.md spec + reconciliation of two existing prototypes:
|
||||
// - distilled_facts.jsonl / distilled_procedures.jsonl (LLM-extracted
|
||||
// text with run_id + sig_hash + extractor + verifier + embedding)
|
||||
// - contract_analyses.jsonl (observer integration + retrieval
|
||||
// telemetry + cost + duration)
|
||||
//
|
||||
// Required fields are the ones every record MUST have for traceability:
|
||||
// run_id, task_id, timestamp, schema_version, provenance. Everything
|
||||
// else is typed-but-optional because no single source has all of them
|
||||
// — the Evidence View materializes them by JOINing across streams when
|
||||
// the source data is present.
|
||||
//
|
||||
// schema_version starts at 1 and gets bumped on breaking changes.
|
||||
// Validators MUST check schema_version and refuse unknown values so a
|
||||
// future v2 reader doesn't silently accept v1 records (or vice versa).
|
||||
|
||||
import {
|
||||
ValidationResult, Provenance,
|
||||
requireString, requireNumber, requireIsoTimestamp, requireProvenance, requireStringArray,
|
||||
} from "./types";
|
||||
|
||||
// Current schema version; validateEvidenceRecord rejects any other value.
export const EVIDENCE_SCHEMA_VERSION = 1;

// Role the model played in producing this evidence row. Enforced by
// validateEvidenceRecord when model_role is present.
export type ModelRole =
  | "executor" // produced the answer (e.g. scrum reviewer, mode runner LLM call)
  | "reviewer" // judged an executor output (e.g. observer, hand-review)
  | "extractor" // pulled structured data from text (e.g. fact_extractor)
  | "verifier" // confirmed/rejected an extracted claim (verifier in distilled_*)
  | "categorizer" // assigned a category (categorizer in distilled_*)
  | "tiebreaker" // resolved a consensus split
  | "applier" // landed code (scrum_applier)
  | "embedder" // produced embeddings
  | "other";
|
||||
|
||||
export interface EvidenceRecord {
  // ── Identity ──
  // run_id ties this record to a specific execution. Sources use it
  // inconsistently (some stream-level, some per-call). The Evidence
  // View canonicalizes to per-call; if the source is stream-level,
  // synthesize as `${stream_run_id}:${row_index}`.
  run_id: string;

  // task_id groups records by logical task (e.g. one PR = one task_id
  // across multiple per-call runs). Defaults to run_id when no group
  // exists — never null.
  task_id: string;

  // ISO 8601 of when the EXECUTION happened, not when this record was
  // materialized. Use the source row's timestamp; provenance carries
  // the materialization time separately.
  timestamp: string;

  // Must equal EVIDENCE_SCHEMA_VERSION — enforced by validateEvidenceRecord.
  schema_version: number;

  // ── Provenance ── (required — no record without source linkage)
  provenance: Provenance;

  // ── Model attribution (optional) ──
  model_name?: string; // e.g. "kimi-k2:1t", "gpt-oss:120b"
  model_provider?: string; // e.g. "ollama_cloud", "openrouter", "ollama"
  model_role?: ModelRole;

  // ── Content hashes (optional) ──
  // sha256 of the full input prompt and full output content. Pre-
  // computed so the Evidence Index can dedup across re-runs of the
  // same prompt without re-hashing. Validator requires lowercase hex.
  input_hash?: string;
  output_hash?: string;

  // ── Repo + execution context ──
  source_files?: string[]; // files the run touched/read
  commands_run?: string[]; // shell commands or tool calls fired
  retrieved_context?: { // what the model saw via retrieval
    matrix_corpora?: string[];
    matrix_hits?: number;
    matrix_chunks_kept?: number;
    matrix_chunks_dropped?: number;
    pathway_fingerprints_seen?: number;
  };

  // ── Observer + scratchpad ──
  observer_notes?: string[]; // observer.review() free-form notes
  // Open string union — the validator does not restrict verdict values.
  observer_verdict?: "accept" | "reject" | "cycle" | string;
  observer_confidence?: number; // 0-100 (range enforced by validator)
  scratchpad_summary?: string; // tree-split scratchpad text or hash ref

  // ── Outcome markers ──
  // Both arrays exist because a run can have multiple succeeded gates
  // AND multiple failed gates simultaneously. Empty arrays are valid;
  // missing arrays are also valid (means "no evidence either way").
  success_markers?: string[]; // e.g. "cargo_green", "tests_passed", "anchor_grounded"
  failure_markers?: string[]; // e.g. "warning_count_up", "rationale_mismatch", "consensus_split"

  // ── Validation telemetry ──
  // Not structurally validated beyond being present — open index signature.
  validation_results?: {
    grounded_fraction?: number; // mode_compare grounding %
    schema_valid?: boolean;
    pathway_replay_succeeded?: boolean;
    [key: string]: unknown;
  };

  // ── Human-in-loop ──
  human_override?: {
    overrider: string; // user identifier
    decision: "accept" | "reject" | "needs_review";
    reason: string;
    overridden_at: string; // ISO 8601
  } | null;

  // ── Performance ──
  // NOTE(review): these are not range-checked by the validator —
  // presumably non-negative; confirm upstream producers.
  cost_usd?: number;
  latency_ms?: number;
  prompt_tokens?: number;
  completion_tokens?: number;

  // ── Free-form text content (the actual run output) ──
  // Optional because some sources are pure metadata (auto_apply.jsonl)
  // and have no text payload. Present for distilled_*, contract_analyses,
  // mode_experiments, scrum_reviews etc.
  text?: string;

  // ── Domain-specific metadata bucket ──
  // Source-specific fields that don't earn a top-level slot. e.g.
  // contract_analyses rows carry `contractor` here; mode_experiments
  // could carry `corpus_set`. Typed scalar values only — keep this
  // small or it becomes a junk drawer. Added 2026-04-27 (Kimi audit
  // flagged `(ev as any).contractor` schema bypass at export_sft.ts:126).
  metadata?: Record<string, string | number | boolean>;
}
|
||||
|
||||
export function validateEvidenceRecord(input: unknown): ValidationResult<EvidenceRecord> {
|
||||
const errors: string[] = [];
|
||||
|
||||
if (typeof input !== "object" || input === null) {
|
||||
return { valid: false, errors: ["expected object, got " + (input === null ? "null" : typeof input)] };
|
||||
}
|
||||
const r = input as Record<string, unknown>;
|
||||
|
||||
// Required
|
||||
let ok = true;
|
||||
ok = requireString(r.run_id, "run_id", errors) && ok;
|
||||
ok = requireString(r.task_id, "task_id", errors) && ok;
|
||||
ok = requireIsoTimestamp(r.timestamp, "timestamp", errors) && ok;
|
||||
ok = requireProvenance(r.provenance, "provenance", errors) && ok;
|
||||
|
||||
if (r.schema_version !== EVIDENCE_SCHEMA_VERSION) {
|
||||
errors.push(`schema_version: expected ${EVIDENCE_SCHEMA_VERSION}, got ${JSON.stringify(r.schema_version)}`);
|
||||
ok = false;
|
||||
}
|
||||
|
||||
// Optional but typed-when-present
|
||||
if (r.model_role !== undefined) {
|
||||
const valid: ModelRole[] = ["executor", "reviewer", "extractor", "verifier", "categorizer", "tiebreaker", "applier", "embedder", "other"];
|
||||
if (!valid.includes(r.model_role as ModelRole)) {
|
||||
errors.push(`model_role: must be one of ${valid.join("|")}, got ${JSON.stringify(r.model_role)}`);
|
||||
ok = false;
|
||||
}
|
||||
}
|
||||
if (r.input_hash !== undefined && !/^[0-9a-f]{64}$/.test(String(r.input_hash))) {
|
||||
errors.push("input_hash: must be hex sha256 when present");
|
||||
ok = false;
|
||||
}
|
||||
if (r.output_hash !== undefined && !/^[0-9a-f]{64}$/.test(String(r.output_hash))) {
|
||||
errors.push("output_hash: must be hex sha256 when present");
|
||||
ok = false;
|
||||
}
|
||||
if (r.source_files !== undefined && !requireStringArray(r.source_files, "source_files", errors)) ok = false;
|
||||
if (r.commands_run !== undefined && !requireStringArray(r.commands_run, "commands_run", errors)) ok = false;
|
||||
if (r.success_markers !== undefined && !requireStringArray(r.success_markers, "success_markers", errors)) ok = false;
|
||||
if (r.failure_markers !== undefined && !requireStringArray(r.failure_markers, "failure_markers", errors)) ok = false;
|
||||
if (r.observer_notes !== undefined && !requireStringArray(r.observer_notes, "observer_notes", errors)) ok = false;
|
||||
|
||||
if (r.observer_confidence !== undefined) {
|
||||
if (!requireNumber(r.observer_confidence, "observer_confidence", errors)) ok = false;
|
||||
else if ((r.observer_confidence as number) < 0 || (r.observer_confidence as number) > 100) {
|
||||
errors.push("observer_confidence: must be in [0, 100]");
|
||||
ok = false;
|
||||
}
|
||||
}
|
||||
|
||||
if (r.human_override !== undefined && r.human_override !== null) {
|
||||
const ho = r.human_override as Record<string, unknown>;
|
||||
if (typeof ho !== "object") {
|
||||
errors.push("human_override: expected object or null");
|
||||
ok = false;
|
||||
} else {
|
||||
ok = requireString(ho.overrider, "human_override.overrider", errors) && ok;
|
||||
ok = requireString(ho.reason, "human_override.reason", errors) && ok;
|
||||
ok = requireIsoTimestamp(ho.overridden_at, "human_override.overridden_at", errors) && ok;
|
||||
if (!["accept", "reject", "needs_review"].includes(ho.decision as string)) {
|
||||
errors.push(`human_override.decision: must be accept|reject|needs_review`);
|
||||
ok = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!ok) return { valid: false, errors };
|
||||
return { valid: true, value: r as unknown as EvidenceRecord };
|
||||
}
|
||||
@ -1,11 +0,0 @@
|
||||
{
|
||||
"run_id": "cae21289",
|
||||
"task_id": "team_runs:637",
|
||||
"timestamp": "2026-04-23T09:54:40.729599Z",
|
||||
"schema_version": 1,
|
||||
"provenance": {
|
||||
"source_file": "data/_kb/distilled_facts.jsonl",
|
||||
"sig_hash": "not-a-real-sha256",
|
||||
"recorded_at": "yesterday"
|
||||
}
|
||||
}
|
||||
@ -1,11 +0,0 @@
|
||||
{
|
||||
"run_id": "cae21289",
|
||||
"task_id": "team_runs:637",
|
||||
"timestamp": "2026-04-23T09:54:40.729599Z",
|
||||
"schema_version": 99,
|
||||
"provenance": {
|
||||
"source_file": "data/_kb/distilled_facts.jsonl",
|
||||
"sig_hash": "21a809e2dc43dfae0000000000000000000000000000000000000000deadbeef",
|
||||
"recorded_at": "2026-04-26T22:30:00.000Z"
|
||||
}
|
||||
}
|
||||
@ -1,11 +0,0 @@
|
||||
{
|
||||
"task_id": "team_runs:637",
|
||||
"timestamp": "2026-04-23T09:54:40.729599Z",
|
||||
"schema_version": 1,
|
||||
"provenance": {
|
||||
"source_file": "data/_kb/distilled_facts.jsonl",
|
||||
"sig_hash": "21a809e2dc43dfae0000000000000000000000000000000000000000deadbeef",
|
||||
"recorded_at": "2026-04-26T22:30:00.000Z"
|
||||
},
|
||||
"text": "missing run_id should fail validation"
|
||||
}
|
||||
@ -1,27 +0,0 @@
|
||||
{
|
||||
"run_id": "contract_analysis:101078392:1777250758717",
|
||||
"task_id": "permit:101078392",
|
||||
"timestamp": "2026-04-25T23:45:58.717Z",
|
||||
"schema_version": 1,
|
||||
"provenance": {
|
||||
"source_file": "data/_kb/contract_analyses.jsonl",
|
||||
"line_offset": 0,
|
||||
"sig_hash": "f1f1f1f1f1f1f1f1f1f1f1f1f1f1f1f1f1f1f1f1f1f1f1f1f1f1f1f1f1f1f1f1",
|
||||
"recorded_at": "2026-04-26T22:30:00.000Z"
|
||||
},
|
||||
"model_name": "kimi-k2:1t",
|
||||
"model_role": "executor",
|
||||
"model_provider": "ollama_cloud",
|
||||
"retrieved_context": {
|
||||
"matrix_corpora": ["entity_brief_v1", "chicago_permits_v1", "distilled_procedural_v20260423102847", "sec_tickers_v1"],
|
||||
"matrix_hits": 10
|
||||
},
|
||||
"observer_notes": ["contractor history shows 0 prior fills in Chicago downtown zone"],
|
||||
"observer_verdict": "reject",
|
||||
"observer_confidence": 95,
|
||||
"success_markers": ["matrix_hits_above_threshold"],
|
||||
"failure_markers": ["observer_rejected"],
|
||||
"cost_usd": 0.0002,
|
||||
"latency_ms": 25419,
|
||||
"text": "Permit 101078392 contractor ANTHONY FIORE — analysis: insufficient prior performance signal; recommend escalation."
|
||||
}
|
||||
@ -1,19 +0,0 @@
|
||||
{
|
||||
"run_id": "cae21289",
|
||||
"task_id": "team_runs:637",
|
||||
"timestamp": "2026-04-23T09:54:40.729599Z",
|
||||
"schema_version": 1,
|
||||
"provenance": {
|
||||
"source_file": "data/_kb/distilled_facts.jsonl",
|
||||
"line_offset": 0,
|
||||
"sig_hash": "21a809e2dc43dfae0000000000000000000000000000000000000000deadbeef",
|
||||
"recorded_at": "2026-04-26T22:30:00.000Z"
|
||||
},
|
||||
"model_name": "qwen2.5:latest",
|
||||
"model_role": "extractor",
|
||||
"model_provider": "ollama",
|
||||
"text": "Convergence refers to the system stabilizing into a state of high performance with low variance across iterations.",
|
||||
"validation_results": {
|
||||
"schema_valid": true
|
||||
}
|
||||
}
|
||||
@ -1,56 +0,0 @@
|
||||
// ModelLedgerEntry — aggregate per-task-type-per-model performance.
|
||||
// Built by aggregating mode_experiments.jsonl + model_trust.jsonl.
|
||||
// Updated rather than appended — one row per (model_name, task_type)
|
||||
// representing latest aggregates.
|
||||
import {
|
||||
ValidationResult, requireString, requireNumber, requireIsoTimestamp, requireStringArray,
|
||||
} from "./types";
|
||||
|
||||
// Current schema version; validateModelLedgerEntry rejects any other value.
export const MODEL_LEDGER_SCHEMA_VERSION = 1;

// One aggregate row per (model_name, task_type) — updated in place,
// never appended (see file header).
export interface ModelLedgerEntry {
  schema_version: number; // must equal MODEL_LEDGER_SCHEMA_VERSION
  model_name: string;
  model_provider: string;
  task_type: string;
  success_rate: number; // [0, 1] (range enforced by validator)
  failure_modes: string[]; // top failure mode tags
  best_partner_model?: string; // pairs well with X (consensus / tie-break)
  escalation_role?: string; // when this model gets escalated TO (or FROM)
  // NOTE(review): cost/latency percentiles are not range-checked by the
  // validator — presumably non-negative; confirm aggregation code.
  cost_usd_p50?: number;
  latency_ms_p50?: number;
  latency_ms_p95?: number;
  context_window?: number;
  sample_count: number; // positive integer — no aggregate from zero samples
  last_updated: string; // ISO 8601 (enforced by validator)
  notes?: string;
}
|
||||
|
||||
export function validateModelLedgerEntry(input: unknown): ValidationResult<ModelLedgerEntry> {
|
||||
const errors: string[] = [];
|
||||
if (typeof input !== "object" || input === null) return { valid: false, errors: ["expected object"] };
|
||||
const r = input as Record<string, unknown>;
|
||||
let ok = true;
|
||||
|
||||
if (r.schema_version !== MODEL_LEDGER_SCHEMA_VERSION) {
|
||||
errors.push(`schema_version: expected ${MODEL_LEDGER_SCHEMA_VERSION}, got ${JSON.stringify(r.schema_version)}`);
|
||||
ok = false;
|
||||
}
|
||||
ok = requireString(r.model_name, "model_name", errors) && ok;
|
||||
ok = requireString(r.model_provider, "model_provider", errors) && ok;
|
||||
ok = requireString(r.task_type, "task_type", errors) && ok;
|
||||
ok = requireIsoTimestamp(r.last_updated, "last_updated", errors) && ok;
|
||||
ok = requireStringArray(r.failure_modes, "failure_modes", errors) && ok;
|
||||
|
||||
if (!requireNumber(r.success_rate, "success_rate", errors)) ok = false;
|
||||
else if ((r.success_rate as number) < 0 || (r.success_rate as number) > 1) {
|
||||
errors.push("success_rate: must be in [0, 1]"); ok = false;
|
||||
}
|
||||
if (!requireNumber(r.sample_count, "sample_count", errors)) ok = false;
|
||||
else if ((r.sample_count as number) < 1 || !Number.isInteger(r.sample_count)) {
|
||||
errors.push("sample_count: must be positive integer (no aggregate from zero samples)"); ok = false;
|
||||
}
|
||||
|
||||
if (!ok) return { valid: false, errors };
|
||||
return { valid: true, value: r as unknown as ModelLedgerEntry };
|
||||
}
|
||||
@ -1,68 +0,0 @@
|
||||
// Playbook — procedural knowledge extracted from accepted/partially-
|
||||
// accepted runs. Different from pathway_memory's bug_fingerprints (which
|
||||
// are pattern-detectors) — playbooks describe HOW to handle a task type.
|
||||
import {
|
||||
ValidationResult, requireString, requireIsoTimestamp, requireProvenance, requireStringArray,
|
||||
} from "./types";
|
||||
|
||||
// Current schema version; validatePlaybook rejects any other value.
export const PLAYBOOK_SCHEMA_VERSION = 1;

// A per-task-type procedure distilled from accepted runs (see file header).
export interface Playbook {
  schema_version: number; // must equal PLAYBOOK_SCHEMA_VERSION
  playbook_id: string;
  task_type: string; // e.g. "scrum_review", "pr_audit", "staffing.fill"
  problem_pattern: string; // when does this playbook apply?
  useful_context: string[]; // what to retrieve before running
  model_routing_path: string[]; // ordered model attempts that worked
  commands_worked: string[];
  commands_failed: string[];
  validation_steps: string[];
  repo_files_touched: string[];
  recovery_strategy: string; // what to do when the path fails
  known_failure_modes: string[];
  escalation_threshold: string; // when to switch to a stronger model
  acceptance_criteria: string[]; // how to know it succeeded — validator rejects empty
  source_run_ids: string[]; // FK to EvidenceRecord.run_id (provenance — every playbook traces to source); validator rejects empty
  created_at: string; // ISO 8601 (enforced by validator)
  provenance: { source_file: string; line_offset?: number; sig_hash: string; recorded_at: string };
}
|
||||
|
||||
export function validatePlaybook(input: unknown): ValidationResult<Playbook> {
|
||||
const errors: string[] = [];
|
||||
if (typeof input !== "object" || input === null) return { valid: false, errors: ["expected object"] };
|
||||
const r = input as Record<string, unknown>;
|
||||
let ok = true;
|
||||
|
||||
if (r.schema_version !== PLAYBOOK_SCHEMA_VERSION) {
|
||||
errors.push(`schema_version: expected ${PLAYBOOK_SCHEMA_VERSION}, got ${JSON.stringify(r.schema_version)}`);
|
||||
ok = false;
|
||||
}
|
||||
ok = requireString(r.playbook_id, "playbook_id", errors) && ok;
|
||||
ok = requireString(r.task_type, "task_type", errors) && ok;
|
||||
ok = requireString(r.problem_pattern, "problem_pattern", errors) && ok;
|
||||
ok = requireString(r.recovery_strategy, "recovery_strategy", errors) && ok;
|
||||
ok = requireString(r.escalation_threshold, "escalation_threshold", errors) && ok;
|
||||
ok = requireIsoTimestamp(r.created_at, "created_at", errors) && ok;
|
||||
ok = requireStringArray(r.useful_context, "useful_context", errors) && ok;
|
||||
ok = requireStringArray(r.model_routing_path, "model_routing_path", errors) && ok;
|
||||
ok = requireStringArray(r.commands_worked, "commands_worked", errors) && ok;
|
||||
ok = requireStringArray(r.commands_failed, "commands_failed", errors) && ok;
|
||||
ok = requireStringArray(r.validation_steps, "validation_steps", errors) && ok;
|
||||
ok = requireStringArray(r.repo_files_touched, "repo_files_touched", errors) && ok;
|
||||
ok = requireStringArray(r.known_failure_modes, "known_failure_modes", errors) && ok;
|
||||
ok = requireStringArray(r.acceptance_criteria, "acceptance_criteria", errors) && ok;
|
||||
ok = requireStringArray(r.source_run_ids, "source_run_ids", errors) && ok;
|
||||
|
||||
if (Array.isArray(r.source_run_ids) && r.source_run_ids.length === 0) {
|
||||
errors.push("source_run_ids: must be non-empty — every playbook traces to source evidence (spec non-negotiable)");
|
||||
ok = false;
|
||||
}
|
||||
if (Array.isArray(r.acceptance_criteria) && r.acceptance_criteria.length === 0) {
|
||||
errors.push("acceptance_criteria: must be non-empty — every playbook needs success criteria (spec non-negotiable)");
|
||||
ok = false;
|
||||
}
|
||||
ok = requireProvenance(r.provenance, "provenance", errors) && ok;
|
||||
|
||||
if (!ok) return { valid: false, errors };
|
||||
return { valid: true, value: r as unknown as Playbook };
|
||||
}
|
||||
@ -1,60 +0,0 @@
|
||||
// PreferenceSample — entry in exports/preference/chosen_rejected.jsonl.
|
||||
// Source: real disagreements (audit_discrepancies, scrum ladder retries).
|
||||
// Validator pins: chosen != rejected, both source_run_ids present, reason
|
||||
// is non-empty. No synthesized preferences.
|
||||
import {
|
||||
ValidationResult, requireString, requireIsoTimestamp, requireProvenance,
|
||||
} from "./types";
|
||||
|
||||
// Current schema version; validatePreferenceSample rejects any other value.
export const PREFERENCE_SAMPLE_SCHEMA_VERSION = 1;

// One chosen/rejected pair sourced from a real disagreement (see file
// header) — the validator forbids self-pairing.
export interface PreferenceSample {
  schema_version: number; // must equal PREFERENCE_SAMPLE_SCHEMA_VERSION
  id: string;
  prompt: string;
  chosen: string; // must differ from `rejected` (validator self-pairing guard)
  rejected: string;
  reason: string; // why chosen > rejected — must be non-empty
  chosen_run_id: string; // must differ from rejected_run_id (validator-enforced)
  rejected_run_id: string;
  created_at: string; // ISO 8601 (enforced by validator)
  provenance: { source_file: string; line_offset?: number; sig_hash: string; recorded_at: string };
}
|
||||
|
||||
export function validatePreferenceSample(input: unknown): ValidationResult<PreferenceSample> {
|
||||
const errors: string[] = [];
|
||||
if (typeof input !== "object" || input === null) return { valid: false, errors: ["expected object"] };
|
||||
const r = input as Record<string, unknown>;
|
||||
let ok = true;
|
||||
|
||||
if (r.schema_version !== PREFERENCE_SAMPLE_SCHEMA_VERSION) {
|
||||
errors.push(`schema_version: expected ${PREFERENCE_SAMPLE_SCHEMA_VERSION}, got ${JSON.stringify(r.schema_version)}`);
|
||||
ok = false;
|
||||
}
|
||||
ok = requireString(r.id, "id", errors) && ok;
|
||||
ok = requireString(r.prompt, "prompt", errors) && ok;
|
||||
ok = requireString(r.chosen, "chosen", errors) && ok;
|
||||
ok = requireString(r.rejected, "rejected", errors) && ok;
|
||||
ok = requireString(r.reason, "reason", errors) && ok;
|
||||
ok = requireString(r.chosen_run_id, "chosen_run_id", errors) && ok;
|
||||
ok = requireString(r.rejected_run_id, "rejected_run_id", errors) && ok;
|
||||
ok = requireIsoTimestamp(r.created_at, "created_at", errors) && ok;
|
||||
ok = requireProvenance(r.provenance, "provenance", errors) && ok;
|
||||
|
||||
// Self-pairing guard.
|
||||
if (r.chosen === r.rejected && typeof r.chosen === "string") {
|
||||
errors.push("chosen and rejected must differ — preference data needs a real disagreement");
|
||||
ok = false;
|
||||
}
|
||||
if (r.chosen_run_id === r.rejected_run_id && typeof r.chosen_run_id === "string") {
|
||||
errors.push("chosen_run_id and rejected_run_id must differ — same run can't disagree with itself");
|
||||
ok = false;
|
||||
}
|
||||
if (typeof r.reason === "string" && (r.reason as string).trim().length === 0) {
|
||||
errors.push("reason: must be non-whitespace (every preference needs WHY chosen > rejected)");
|
||||
ok = false;
|
||||
}
|
||||
|
||||
if (!ok) return { valid: false, errors };
|
||||
return { valid: true, value: r as unknown as PreferenceSample };
|
||||
}
|
||||
@ -1,72 +0,0 @@
|
||||
// RagSample — entry in exports/rag/playbooks.jsonl. Spec shape exactly,
|
||||
// plus provenance + success_score (so the index can re-rank by quality).
|
||||
import {
|
||||
ValidationResult, requireString, requireNumber, requireIsoTimestamp, requireProvenance, requireStringArray,
|
||||
} from "./types";
|
||||
|
||||
export const RAG_SAMPLE_SCHEMA_VERSION = 1;

// Allowed source_category values. RAG accepts accepted/partial freely;
// needs_human_review is opt-in (must be tagged so consumers can filter
// it out for SFT). "rejected" is deliberately absent — it never enters RAG.
export const RAG_ALLOWED_CATEGORIES = ["accepted", "partially_accepted", "needs_human_review"] as const;
export type RagSourceCategory = (typeof RAG_ALLOWED_CATEGORIES)[number];

// One retrievable document in exports/rag/playbooks.jsonl. The validator
// additionally enforces success_score === source_category and non-blank content.
export interface RagSample {
  schema_version: number;
  id: string;
  title: string;
  content: string;       // validator rejects whitespace-only content
  tags: string[];
  source_run_id: string;
  // Snapshot of the score the source carried at export time. Lets a
  // consumer see "this was partial" without re-reading scored-runs.
  success_score: RagSourceCategory;
  // Same value as success_score by spec (now.md asks for both fields).
  // Kept distinct so future schemas can diverge them (e.g. an
  // "is_review_material" flag) without breaking old consumers.
  source_category: RagSourceCategory;
  embedding_text: string; // the text to embed (often == content but can be shorter)
  created_at: string;     // ISO 8601
  provenance: { source_file: string; line_offset?: number; sig_hash: string; recorded_at: string };
}
|
||||
|
||||
export function validateRagSample(input: unknown): ValidationResult<RagSample> {
|
||||
const errors: string[] = [];
|
||||
if (typeof input !== "object" || input === null) return { valid: false, errors: ["expected object"] };
|
||||
const r = input as Record<string, unknown>;
|
||||
let ok = true;
|
||||
|
||||
if (r.schema_version !== RAG_SAMPLE_SCHEMA_VERSION) {
|
||||
errors.push(`schema_version: expected ${RAG_SAMPLE_SCHEMA_VERSION}, got ${JSON.stringify(r.schema_version)}`);
|
||||
ok = false;
|
||||
}
|
||||
ok = requireString(r.id, "id", errors) && ok;
|
||||
ok = requireString(r.title, "title", errors) && ok;
|
||||
ok = requireString(r.content, "content", errors) && ok;
|
||||
ok = requireString(r.embedding_text, "embedding_text", errors) && ok;
|
||||
ok = requireString(r.source_run_id, "source_run_id", errors) && ok;
|
||||
ok = requireIsoTimestamp(r.created_at, "created_at", errors) && ok;
|
||||
ok = requireStringArray(r.tags, "tags", errors) && ok;
|
||||
ok = requireProvenance(r.provenance, "provenance", errors) && ok;
|
||||
|
||||
if (!RAG_ALLOWED_CATEGORIES.includes(r.success_score as RagSourceCategory)) {
|
||||
errors.push(`success_score: must be one of ${RAG_ALLOWED_CATEGORIES.join("|")} (rejected never enters RAG)`);
|
||||
ok = false;
|
||||
}
|
||||
if (!RAG_ALLOWED_CATEGORIES.includes(r.source_category as RagSourceCategory)) {
|
||||
errors.push(`source_category: must be one of ${RAG_ALLOWED_CATEGORIES.join("|")}`);
|
||||
ok = false;
|
||||
}
|
||||
if (r.success_score !== r.source_category) {
|
||||
errors.push("success_score and source_category must match (mirrored fields per spec)");
|
||||
ok = false;
|
||||
}
|
||||
if (typeof r.content === "string" && (r.content as string).trim().length === 0) {
|
||||
errors.push("content: must be non-whitespace");
|
||||
ok = false;
|
||||
}
|
||||
|
||||
if (!ok) return { valid: false, errors };
|
||||
return { valid: true, value: r as unknown as RagSample };
|
||||
}
|
||||
@ -1,286 +0,0 @@
|
||||
// Real-data validation test — proves the EvidenceRecord schema fits
|
||||
// what we ALREADY produce, with the minimum transformation each source
|
||||
// stream requires. Doubles as the stale-extraction probe: if
|
||||
// distilled_facts.jsonl rows can't materialize, we know that stream
|
||||
// has rotted and Phase 2 sources from elsewhere.
|
||||
//
|
||||
// Strategy:
|
||||
// 1. Read first N rows from each source jsonl (skip if missing)
|
||||
// 2. Apply minimal transformer: add schema_version + provenance,
|
||||
// synthesize run_id/task_id when source doesn't carry them
|
||||
// 3. Validate each materialized record
|
||||
// 4. Tally pass/fail per source + collect failure reasons
|
||||
//
|
||||
// This file is allowed to skip when source files don't exist (fresh
|
||||
// clone), so it acts as both a CI guard and a real-environment probe.
|
||||
|
||||
import { test, expect } from "bun:test";
|
||||
import { existsSync, readFileSync } from "node:fs";
|
||||
import { resolve } from "node:path";
|
||||
|
||||
import {
|
||||
validateEvidenceRecord, EVIDENCE_SCHEMA_VERSION, EvidenceRecord, ModelRole,
|
||||
} from "./evidence_record";
|
||||
|
||||
// Absolute repo root — every probe reads a production jsonl stream under it.
const ROOT = "/home/profit/lakehouse";
// How many leading rows of each source file to materialize and validate.
const SAMPLE_PER_SOURCE = 10;

// One probe per source stream: the jsonl file to read plus the minimal
// row → EvidenceRecord transform. A transform may return null to skip a row.
interface SourceProbe {
  source_file: string;
  transform: (row: any, lineNo: number) => Partial<EvidenceRecord> | null;
}
|
||||
|
||||
// Canonical 64-char synthetic sha256 for tests where the source row
|
||||
// lacks one. Pretends the materializer would compute it via
|
||||
// canonicalSha256(orderedKeys(row)) at Phase 2 time. We use a fixed
|
||||
// value here to keep the test deterministic; real materialization
|
||||
// re-hashes per row.
|
||||
// Fixed 64-char hex stand-in used when a source row carries no usable hash;
// keeps the test deterministic (real materialization re-hashes per row).
const PLACEHOLDER_SHA = "0000000000000000000000000000000000000000000000000000000000000000";
// Frozen timestamp so provenance.recorded_at is stable across test runs.
const RECORDED = "2026-04-26T22:30:00.000Z";
|
||||
|
||||
function provFor(source_file: string, lineNo: number, sigHashRaw?: string): EvidenceRecord["provenance"] {
|
||||
// Pad shorter hashes (distilled_* uses 16-char) to 64 — mimics
|
||||
// canonical recompute.
|
||||
const sig = sigHashRaw && /^[0-9a-f]+$/.test(sigHashRaw)
|
||||
? sigHashRaw.padEnd(64, "0").slice(0, 64)
|
||||
: PLACEHOLDER_SHA;
|
||||
return {
|
||||
source_file: source_file.replace(`${ROOT}/`, ""),
|
||||
line_offset: lineNo,
|
||||
sig_hash: sig,
|
||||
recorded_at: RECORDED,
|
||||
};
|
||||
}
|
||||
|
||||
// One probe per production stream. Each transform maps a raw jsonl row to a
// Partial<EvidenceRecord>; run_id/task_id are synthesized when the source
// doesn't carry them. Field mappings below are exactly what each stream's
// rows expose — no extra derivation happens here.
const PROBES: SourceProbe[] = [
  // distilled_facts: extractor output; run/task ids fall back to line number.
  {
    source_file: `${ROOT}/data/_kb/distilled_facts.jsonl`,
    transform: (row: any, lineNo: number) => ({
      run_id: String(row.run_id ?? `distilled_facts:${lineNo}`),
      task_id: String(row.source_label ?? `distilled_facts:${lineNo}`),
      timestamp: row.created_at,
      schema_version: EVIDENCE_SCHEMA_VERSION,
      provenance: provFor(`${ROOT}/data/_kb/distilled_facts.jsonl`, lineNo, row.sig_hash),
      model_name: row.extractor,
      model_role: "extractor" as ModelRole,
      model_provider: "ollama",
      text: row.text,
    }),
  },
  // distilled_procedures: same shape as distilled_facts, different stream.
  {
    source_file: `${ROOT}/data/_kb/distilled_procedures.jsonl`,
    transform: (row: any, lineNo: number) => ({
      run_id: String(row.run_id ?? `distilled_procedures:${lineNo}`),
      task_id: String(row.source_label ?? `distilled_procedures:${lineNo}`),
      timestamp: row.created_at,
      schema_version: EVIDENCE_SCHEMA_VERSION,
      provenance: provFor(`${ROOT}/data/_kb/distilled_procedures.jsonl`, lineNo, row.sig_hash),
      model_name: row.extractor,
      model_role: "extractor" as ModelRole,
      model_provider: "ollama",
      text: row.text,
    }),
  },
  // contract_analyses: executor runs keyed by permit; carries observer
  // verdicts, cost (micro-USD in source — divided to USD here), latency.
  {
    source_file: `${ROOT}/data/_kb/contract_analyses.jsonl`,
    transform: (row: any, lineNo: number) => ({
      run_id: `contract_analysis:${row.permit_id}:${new Date(row.ts).getTime()}`,
      task_id: `permit:${row.permit_id}`,
      timestamp: row.ts,
      schema_version: EVIDENCE_SCHEMA_VERSION,
      provenance: provFor(`${ROOT}/data/_kb/contract_analyses.jsonl`, lineNo),
      model_role: "executor" as ModelRole,
      retrieved_context: {
        matrix_corpora: Object.keys(row.matrix_corpora ?? {}),
        matrix_hits: row.matrix_hits,
      },
      // observer_notes may be a string or an array in source — flatten to array.
      observer_notes: row.observer_notes ? [row.observer_notes].flat() : undefined,
      observer_verdict: row.observer_verdict,
      observer_confidence: row.observer_conf,
      success_markers: row.ok ? ["matrix_hits_above_threshold"] : undefined,
      failure_markers: !row.ok || row.observer_verdict === "reject" ? ["observer_rejected"] : undefined,
      cost_usd: typeof row.cost === "number" ? row.cost / 1_000_000 : undefined,
      latency_ms: row.duration_ms,
      text: row.analysis,
    }),
  },
  // mode_experiments: executor runs; provider inferred from model-name shape
  // ("/"-qualified names are openrouter; bare names are ollama_cloud).
  {
    source_file: `${ROOT}/data/_kb/mode_experiments.jsonl`,
    transform: (row: any, lineNo: number) => ({
      run_id: `mode_exec:${new Date(row.ts).getTime()}:${row.file_path ?? "?"}`,
      task_id: row.task_class,
      timestamp: row.ts,
      schema_version: EVIDENCE_SCHEMA_VERSION,
      provenance: provFor(`${ROOT}/data/_kb/mode_experiments.jsonl`, lineNo),
      model_name: row.model,
      model_role: "executor" as ModelRole,
      model_provider: row.model?.includes("/") ? "openrouter" : "ollama_cloud",
      retrieved_context: {
        matrix_corpora: row.sources?.matrix_corpus,
        matrix_chunks_kept: row.sources?.matrix_chunks_kept,
        matrix_chunks_dropped: row.sources?.matrix_chunks_dropped,
        pathway_fingerprints_seen: row.sources?.bug_fingerprints_count,
      },
      latency_ms: row.latency_ms,
      text: row.response,
      source_files: row.file_path ? [row.file_path] : undefined,
    }),
  },
  // scrum_reviews: per-file review outcomes; attempt count becomes a marker.
  {
    source_file: `${ROOT}/data/_kb/scrum_reviews.jsonl`,
    transform: (row: any, lineNo: number) => ({
      run_id: `scrum:${new Date(row.reviewed_at).getTime()}:${row.file}`,
      task_id: `scrum_review:${row.file}`,
      timestamp: row.reviewed_at,
      schema_version: EVIDENCE_SCHEMA_VERSION,
      provenance: provFor(`${ROOT}/data/_kb/scrum_reviews.jsonl`, lineNo),
      model_name: row.accepted_model,
      model_role: "executor" as ModelRole,
      source_files: [row.file],
      success_markers: row.accepted_on_attempt ? [`accepted_on_attempt_${row.accepted_on_attempt}`] : undefined,
      text: row.suggestions_preview,
    }),
  },
  // observer_escalations: reviewer-role records with token counts only.
  {
    source_file: `${ROOT}/data/_kb/observer_escalations.jsonl`,
    transform: (row: any, lineNo: number) => ({
      run_id: `obs_esc:${new Date(row.ts).getTime()}:${row.sig_hash}`,
      task_id: `observer_escalation:${row.cluster_endpoint ?? "?"}`,
      timestamp: row.ts,
      schema_version: EVIDENCE_SCHEMA_VERSION,
      provenance: provFor(`${ROOT}/data/_kb/observer_escalations.jsonl`, lineNo, row.sig_hash),
      model_role: "reviewer" as ModelRole,
      prompt_tokens: row.prompt_tokens,
      completion_tokens: row.completion_tokens,
      text: row.analysis,
    }),
  },
  // audit_facts: PR-level extraction; only counts survive into text for now.
  {
    source_file: `${ROOT}/data/_kb/audit_facts.jsonl`,
    transform: (row: any, lineNo: number) => ({
      run_id: `audit_facts:${row.head_sha}:${lineNo}`,
      task_id: `pr:${row.pr_number}`,
      timestamp: row.extracted_at,
      schema_version: EVIDENCE_SCHEMA_VERSION,
      provenance: provFor(`${ROOT}/data/_kb/audit_facts.jsonl`, lineNo),
      model_name: row.extractor,
      model_role: "extractor" as ModelRole,
      // facts/entities/relationships go into text as a JSON dump for now;
      // structured handling lives in Phase 2 where we map to specific
      // EvidenceRecord substructures.
      text: JSON.stringify({
        facts: row.facts?.length ?? 0,
        entities: row.entities?.length ?? 0,
        relationships: row.relationships?.length ?? 0,
      }),
    }),
  },
];
|
||||
|
||||
// Per-source tally accumulated by the probe tests below and rendered by
// the markdown-report test at the bottom of the file.
interface ProbeResult {
  source_file: string;       // repo-relative label
  rows_attempted: number;    // rows read (counted even when JSON.parse fails)
  rows_present: boolean;     // false when the source file was missing entirely
  passed: number;
  failed: number;
  failure_reasons: string[]; // unique error strings, top 5
}

// Shared mutable accumulator across test callbacks.
// NOTE(review): assumes the runner executes tests in declaration order so
// the report test sees every probe's result — confirm for bun:test.
const RESULTS: ProbeResult[] = [];
|
||||
|
||||
// Register one test per probe. Each test reads up to SAMPLE_PER_SOURCE rows,
// applies the probe's transform, validates the result, and pushes a tally
// into RESULTS for the report test. Missing files are a silent skip, not a
// failure, so the suite works on fresh clones.
for (const probe of PROBES) {
  const sourceLabel = probe.source_file.replace(`${ROOT}/`, "");

  test(`real-data: ${sourceLabel}`, () => {
    const result: ProbeResult = {
      source_file: sourceLabel,
      rows_attempted: 0,
      rows_present: false,
      passed: 0,
      failed: 0,
      failure_reasons: [],
    };

    if (!existsSync(probe.source_file)) {
      RESULTS.push(result);
      // Skip silently — fresh clones won't have these files
      return;
    }

    result.rows_present = true;
    // First N non-empty lines only — this is a probe, not a full scan.
    const lines = readFileSync(probe.source_file, "utf8").split("\n").filter(Boolean).slice(0, SAMPLE_PER_SOURCE);
    const reasons = new Set<string>();

    for (let i = 0; i < lines.length; i++) {
      result.rows_attempted++;
      let row: unknown;
      // Malformed JSON rows count as attempted but are otherwise ignored.
      try { row = JSON.parse(lines[i]); }
      catch { continue; }

      const transformed = probe.transform(row, i);
      if (!transformed) continue;

      const v = validateEvidenceRecord(transformed);
      if (v.valid) result.passed++;
      else {
        result.failed++;
        // Set dedupes repeated error strings across rows.
        for (const e of v.errors) reasons.add(e);
      }
    }
    result.failure_reasons = Array.from(reasons).slice(0, 5);
    RESULTS.push(result);

    // Test passes as long as we attempted something and got a result.
    // Per-source pass/fail counts are reported in the markdown writeup.
    expect(result.rows_attempted).toBeGreaterThanOrEqual(0);
  });
}
|
||||
|
||||
// Renders every ProbeResult into a markdown summary table plus an explicit
// stale-extraction verdict for the two distilled_* streams, then writes the
// report to disk and echoes it to stdout.
test("real-data: emit markdown report", () => {
  const md: string[] = [];
  md.push("# Real-data validation report");
  md.push("");
  md.push("Schema = EvidenceRecord v" + EVIDENCE_SCHEMA_VERSION + ". Sample = first " + SAMPLE_PER_SOURCE + " rows per source.");
  md.push("");
  md.push("| Source | Present | Rows | Pass | Fail | Pass% |");
  md.push("|---|---|---|---|---|---|");
  for (const r of RESULTS) {
    // "—" when nothing was attempted (missing file) so we never divide by zero.
    const pct = r.rows_attempted > 0 ? Math.round(100 * r.passed / r.rows_attempted) + "%" : "—";
    md.push(`| ${r.source_file} | ${r.rows_present ? "✓" : "—"} | ${r.rows_attempted} | ${r.passed} | ${r.failed} | ${pct} |`);
  }
  md.push("");
  // One "## Failures in …" section per source that had any failed row.
  let hasFailures = false;
  for (const r of RESULTS) {
    if (r.failed > 0) {
      hasFailures = true;
      md.push(`## Failures in ${r.source_file}`);
      for (const reason of r.failure_reasons) md.push(`- \`${reason}\``);
      md.push("");
    }
  }
  if (!hasFailures) {
    md.push("**No failures across all probed sources.** Every materialized record validates against EvidenceRecord v1.");
    md.push("");
  }
  // Stale extraction probe: explicit pass/fail
  const distilledFacts = RESULTS.find(r => r.source_file.endsWith("distilled_facts.jsonl"));
  const distilledProc = RESULTS.find(r => r.source_file.endsWith("distilled_procedures.jsonl"));
  md.push("## Stale-extraction probe");
  md.push("");
  if (distilledFacts && distilledFacts.rows_present && distilledFacts.passed > 0) {
    md.push(`- **distilled_facts.jsonl:** ${distilledFacts.passed}/${distilledFacts.rows_attempted} materialize cleanly. Stream is alive at the schema level.`);
  } else if (distilledFacts && !distilledFacts.rows_present) {
    md.push(`- **distilled_facts.jsonl:** missing — stale or never produced. Phase 2 sources from live streams instead.`);
  } else {
    md.push(`- **distilled_facts.jsonl:** present but materialization failures; treat as suspect, prefer mode_experiments + scrum_reviews.`);
  }
  if (distilledProc && distilledProc.rows_present && distilledProc.passed > 0) {
    md.push(`- **distilled_procedures.jsonl:** ${distilledProc.passed}/${distilledProc.rows_attempted} materialize cleanly.`);
  }
  md.push("");

  // Write the markdown to a stable path and stdout
  const out = md.join("\n");
  // NOTE(review): Bun.write's result is not awaited here — presumably fine
  // for a best-effort test artifact, but confirm nothing reads the file
  // before the write settles.
  Bun.write(`${ROOT}/data/_kb/realdata_validation_report.md`, out);
  console.log("\n" + out);
});
|
||||
@ -1,111 +0,0 @@
|
||||
// Receipt — per-pipeline-stage record with everything needed to
|
||||
// reproduce the run. Spec non-negotiable: substantive receipts, not
|
||||
// "ran successfully". Every field below has a deterministic source so
|
||||
// the receipt schema validator catches "I forgot to fill it in" the
|
||||
// same way it catches type errors.
|
||||
import {
|
||||
ValidationResult, requireString, requireNumber, requireIsoTimestamp,
|
||||
} from "./types";
|
||||
|
||||
export const RECEIPT_SCHEMA_VERSION = 1;

// A content-addressed reference to one input or output file of a stage.
export interface FileReference {
  path: string;    // relative to repo root
  sha256: string;  // hex — validator enforces exactly 64 lowercase hex chars
  bytes?: number;  // optional but recommended
}

// Per-pipeline-stage record with everything needed to reproduce the run.
// Spec non-negotiable: substantive receipts, not "ran successfully" — the
// validator treats missing/mistyped fields exactly like type errors.
export interface Receipt {
  schema_version: number;
  command: string;         // shell-line or script identifier
  git_sha: string;         // 40-char hex (full SHA1) — validator-enforced
  git_branch?: string;
  git_dirty?: boolean;     // true if working tree had uncommitted changes
  started_at: string;      // ISO 8601
  ended_at: string;        // ISO 8601
  duration_ms: number;
  input_files: FileReference[];
  output_files: FileReference[];
  record_counts: {
    in: number;
    out: number;
    [key: string]: number; // per-stage extras (filtered, dropped, etc.)
  };
  validation_pass: boolean; // explicit — never inferred
  errors: string[];
  warnings: string[];
}
|
||||
|
||||
function validateFileRef(v: unknown, field: string, errors: string[]): boolean {
|
||||
if (typeof v !== "object" || v === null) {
|
||||
errors.push(`${field}: expected object`);
|
||||
return false;
|
||||
}
|
||||
const f = v as Record<string, unknown>;
|
||||
let ok = true;
|
||||
ok = requireString(f.path, `${field}.path`, errors) && ok;
|
||||
if (typeof f.sha256 !== "string" || !/^[0-9a-f]{64}$/.test(f.sha256)) {
|
||||
errors.push(`${field}.sha256: must be hex sha256`);
|
||||
ok = false;
|
||||
}
|
||||
if (f.bytes !== undefined && typeof f.bytes !== "number") {
|
||||
errors.push(`${field}.bytes: expected number when present`);
|
||||
ok = false;
|
||||
}
|
||||
return ok;
|
||||
}
|
||||
|
||||
export function validateReceipt(input: unknown): ValidationResult<Receipt> {
|
||||
const errors: string[] = [];
|
||||
if (typeof input !== "object" || input === null) {
|
||||
return { valid: false, errors: ["expected object"] };
|
||||
}
|
||||
const r = input as Record<string, unknown>;
|
||||
let ok = true;
|
||||
|
||||
if (r.schema_version !== RECEIPT_SCHEMA_VERSION) {
|
||||
errors.push(`schema_version: expected ${RECEIPT_SCHEMA_VERSION}, got ${JSON.stringify(r.schema_version)}`);
|
||||
ok = false;
|
||||
}
|
||||
ok = requireString(r.command, "command", errors) && ok;
|
||||
if (typeof r.git_sha !== "string" || !/^[0-9a-f]{40}$/.test(r.git_sha as string)) {
|
||||
errors.push("git_sha: must be 40-char hex");
|
||||
ok = false;
|
||||
}
|
||||
ok = requireIsoTimestamp(r.started_at, "started_at", errors) && ok;
|
||||
ok = requireIsoTimestamp(r.ended_at, "ended_at", errors) && ok;
|
||||
ok = requireNumber(r.duration_ms, "duration_ms", errors) && ok;
|
||||
if (typeof r.validation_pass !== "boolean") {
|
||||
errors.push("validation_pass: must be boolean (explicit, never inferred)");
|
||||
ok = false;
|
||||
}
|
||||
if (!Array.isArray(r.input_files)) {
|
||||
errors.push("input_files: expected array");
|
||||
ok = false;
|
||||
} else {
|
||||
for (let i = 0; i < r.input_files.length; i++) {
|
||||
if (!validateFileRef(r.input_files[i], `input_files[${i}]`, errors)) ok = false;
|
||||
}
|
||||
}
|
||||
if (!Array.isArray(r.output_files)) {
|
||||
errors.push("output_files: expected array");
|
||||
ok = false;
|
||||
} else {
|
||||
for (let i = 0; i < r.output_files.length; i++) {
|
||||
if (!validateFileRef(r.output_files[i], `output_files[${i}]`, errors)) ok = false;
|
||||
}
|
||||
}
|
||||
if (typeof r.record_counts !== "object" || r.record_counts === null) {
|
||||
errors.push("record_counts: expected object");
|
||||
ok = false;
|
||||
} else {
|
||||
const rc = r.record_counts as Record<string, unknown>;
|
||||
if (typeof rc.in !== "number") { errors.push("record_counts.in: expected number"); ok = false; }
|
||||
if (typeof rc.out !== "number") { errors.push("record_counts.out: expected number"); ok = false; }
|
||||
}
|
||||
if (!Array.isArray(r.errors)) { errors.push("errors: expected array"); ok = false; }
|
||||
if (!Array.isArray(r.warnings)) { errors.push("warnings: expected array"); ok = false; }
|
||||
|
||||
if (!ok) return { valid: false, errors };
|
||||
return { valid: true, value: r as unknown as Receipt };
|
||||
}
|
||||
@ -1,90 +0,0 @@
|
||||
// run_summary.ts — aggregates StageReceipt rows for one run_id.
|
||||
// Spec field set: total records processed, total accepted/rejected/
|
||||
// quarantined, dataset sizes, validation status, overall hash of run.
|
||||
|
||||
import {
|
||||
ValidationResult, requireString, requireNumber, requireIsoTimestamp, requireSha256,
|
||||
} from "./types";
|
||||
import type { StageName } from "./stage_receipt";
|
||||
|
||||
export const RUN_SUMMARY_SCHEMA_VERSION = 1;

// Roll-up of one stage's receipt counters, embedded in RunSummary.stages.
export interface RunStageSummary {
  stage: StageName;
  records_in: number;
  records_out: number;
  accepted: number;
  rejected: number;
  quarantined: number;
  skipped: number;
  passed: boolean;      // that stage's own validation outcome
  duration_ms: number;
  output_hash: string;  // sha256 of the stage's output
}

// Aggregate of every StageReceipt for one run_id. Spec field set: total
// records processed, total accepted/rejected/quarantined, dataset sizes,
// validation status, overall hash of the run.
export interface RunSummary {
  schema_version: number;
  run_id: string;
  started_at: string;   // earliest stage timestamp
  ended_at: string;     // latest stage timestamp + duration
  git_commit: string;   // 40-char hex — validator-enforced
  stages: RunStageSummary[];
  // Aggregates across stages
  total_records_in: number;
  total_records_out: number;
  total_accepted: number;
  total_rejected: number;
  total_quarantined: number;
  total_skipped: number;
  // Dataset sizes — final outputs of each export stage
  rag_records: number;
  sft_records: number;
  preference_pairs: number;
  // Pipeline-wide pass = AND of every stage validation.passed
  overall_passed: boolean;
  // Run-wide hash: sha256 over each stage's output hash, sorted by stage name.
  // Detects ANY change in any stage output across runs.
  run_hash: string;
  total_duration_ms: number;
}
|
||||
|
||||
export function validateRunSummary(input: unknown): ValidationResult<RunSummary> {
|
||||
const errors: string[] = [];
|
||||
if (typeof input !== "object" || input === null) {
|
||||
return { valid: false, errors: ["expected object"] };
|
||||
}
|
||||
const r = input as Record<string, unknown>;
|
||||
let ok = true;
|
||||
|
||||
if (r.schema_version !== RUN_SUMMARY_SCHEMA_VERSION) {
|
||||
errors.push(`schema_version: expected ${RUN_SUMMARY_SCHEMA_VERSION}, got ${JSON.stringify(r.schema_version)}`);
|
||||
ok = false;
|
||||
}
|
||||
ok = requireString(r.run_id, "run_id", errors) && ok;
|
||||
ok = requireIsoTimestamp(r.started_at, "started_at", errors) && ok;
|
||||
ok = requireIsoTimestamp(r.ended_at, "ended_at", errors) && ok;
|
||||
if (typeof r.git_commit !== "string" || !/^[0-9a-f]{40}$/.test(r.git_commit as string)) {
|
||||
errors.push("git_commit: must be 40-char hex");
|
||||
ok = false;
|
||||
}
|
||||
if (typeof r.overall_passed !== "boolean") {
|
||||
errors.push("overall_passed: must be boolean");
|
||||
ok = false;
|
||||
}
|
||||
ok = requireSha256(r.run_hash, "run_hash", errors) && ok;
|
||||
for (const k of ["total_records_in", "total_records_out", "total_accepted", "total_rejected",
|
||||
"total_quarantined", "total_skipped", "rag_records", "sft_records",
|
||||
"preference_pairs", "total_duration_ms"]) {
|
||||
if (typeof (r as any)[k] !== "number") {
|
||||
errors.push(`${k}: expected number`);
|
||||
ok = false;
|
||||
}
|
||||
}
|
||||
if (!Array.isArray(r.stages)) {
|
||||
errors.push("stages: expected array");
|
||||
ok = false;
|
||||
}
|
||||
|
||||
if (!ok) return { valid: false, errors };
|
||||
return { valid: true, value: r as unknown as RunSummary };
|
||||
}
|
||||
@ -1,367 +0,0 @@
|
||||
// Combined schema tests for ScoredRun, Receipt, Playbook,
|
||||
// ScratchpadSummary, ModelLedgerEntry, RagSample, SftSample,
|
||||
// PreferenceSample. EvidenceRecord lives in its own file because it's
|
||||
// the foundational schema and warrants the JSON-fixture round-trip
|
||||
// pattern; the rest use inline fixture makers since they're simpler.
|
||||
//
|
||||
// Each schema: 1 positive fixture + 4-5 negative cases pinning the
|
||||
// non-negotiable invariants from now.md.
|
||||
//
|
||||
// Run: bun test auditor/schemas/distillation/schemas.test.ts
|
||||
|
||||
import { test, expect } from "bun:test";
|
||||
|
||||
import { validateScoredRun, SCORED_RUN_SCHEMA_VERSION } from "./scored_run";
|
||||
import { validateReceipt, RECEIPT_SCHEMA_VERSION } from "./receipt";
|
||||
import { validatePlaybook, PLAYBOOK_SCHEMA_VERSION } from "./playbook";
|
||||
import { validateScratchpadSummary, SCRATCHPAD_SCHEMA_VERSION } from "./scratchpad_summary";
|
||||
import { validateModelLedgerEntry, MODEL_LEDGER_SCHEMA_VERSION } from "./model_ledger";
|
||||
import { validateRagSample, RAG_SAMPLE_SCHEMA_VERSION } from "./rag_sample";
|
||||
import { validateSftSample, SFT_SAMPLE_SCHEMA_VERSION } from "./sft_sample";
|
||||
import { validatePreferenceSample, PREFERENCE_SAMPLE_SCHEMA_VERSION } from "./preference_sample";
|
||||
|
||||
// Shared fixture atoms: a fixed ISO timestamp, a well-formed 64-char hex
// sha256, and a well-formed 40-char hex git SHA — each the minimal value
// that satisfies the corresponding validator check.
const NOW = "2026-04-26T22:30:00.000Z";
const SHA = "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef";
const GIT_SHA = "f753e11157eef753e11157eef753e11157eef753";

// Valid provenance sub-record reused by every schema fixture below.
const PROVENANCE = {
  source_file: "data/_kb/scored_runs.jsonl",
  line_offset: 0,
  sig_hash: SHA,
  recorded_at: NOW,
};
|
||||
|
||||
// ─── ScoredRun ───────────────────────────────────────────────────────
|
||||
|
||||
// Minimal passing ScoredRun fixture; each negative test below spread-copies
// it and breaks exactly one field.
const SCORED_RUN_OK = {
  schema_version: SCORED_RUN_SCHEMA_VERSION,
  evidence_run_id: "run-abc",
  evidence_task_id: "task-abc",
  category: "accepted",
  reasons: ["cargo_green=true", "anchor_grounding=0.95"],
  scored_at: NOW,
  scorer_version: "v1.0.0",
  sub_scores: { cargo_green: true, anchor_grounding: 0.95 },
  provenance: PROVENANCE,
};

test("ScoredRun: positive validates", () => {
  const r = validateScoredRun(SCORED_RUN_OK);
  // Log the violations on failure so the bare boolean assert isn't opaque.
  if (!r.valid) console.error(r.errors);
  expect(r.valid).toBe(true);
});

test("ScoredRun: empty reasons rejected (every score needs a reason)", () => {
  const r = validateScoredRun({ ...SCORED_RUN_OK, reasons: [] });
  expect(r.valid).toBe(false);
  if (!r.valid) expect(r.errors.some(e => e.includes("reasons"))).toBe(true);
});

test("ScoredRun: invalid category rejected", () => {
  const r = validateScoredRun({ ...SCORED_RUN_OK, category: "maybe_ok" });
  expect(r.valid).toBe(false);
  if (!r.valid) expect(r.errors.some(e => e.includes("category"))).toBe(true);
});

test("ScoredRun: anchor_grounding > 1 rejected (must be in [0, 1])", () => {
  const r = validateScoredRun({ ...SCORED_RUN_OK, sub_scores: { ...SCORED_RUN_OK.sub_scores, anchor_grounding: 1.5 } });
  expect(r.valid).toBe(false);
  if (!r.valid) expect(r.errors.some(e => e.includes("anchor_grounding"))).toBe(true);
});
|
||||
|
||||
// ─── Receipt ─────────────────────────────────────────────────────────
|
||||
|
||||
// Minimal passing Receipt fixture; negative tests below break one
// invariant each (git_sha shape, validation_pass type, file-ref sha256).
const RECEIPT_OK = {
  schema_version: RECEIPT_SCHEMA_VERSION,
  command: "bun run scripts/build_evidence_index.ts",
  git_sha: GIT_SHA,
  git_branch: "scrum/auto-apply-19814",
  git_dirty: false,
  started_at: NOW,
  ended_at: NOW,
  duration_ms: 1234,
  input_files: [{ path: "data/_kb/scrum_reviews.jsonl", sha256: SHA, bytes: 448000 }],
  output_files: [{ path: "data/evidence/2026/04/26/run.jsonl", sha256: SHA }],
  record_counts: { in: 100, out: 95, filtered: 5 },
  validation_pass: true,
  errors: [],
  warnings: [],
};

test("Receipt: positive validates", () => {
  const r = validateReceipt(RECEIPT_OK);
  // Log the violations on failure so the bare boolean assert isn't opaque.
  if (!r.valid) console.error(r.errors);
  expect(r.valid).toBe(true);
});

test("Receipt: bad git_sha rejected (must be 40-char hex)", () => {
  const r = validateReceipt({ ...RECEIPT_OK, git_sha: "abc123" });
  expect(r.valid).toBe(false);
  if (!r.valid) expect(r.errors.some(e => e.includes("git_sha"))).toBe(true);
});

test("Receipt: validation_pass must be boolean (never inferred)", () => {
  const r = validateReceipt({ ...RECEIPT_OK, validation_pass: "yes" });
  expect(r.valid).toBe(false);
  if (!r.valid) expect(r.errors.some(e => e.includes("validation_pass"))).toBe(true);
});

test("Receipt: file refs without proper sha256 rejected", () => {
  const r = validateReceipt({ ...RECEIPT_OK, output_files: [{ path: "x", sha256: "short" }] });
  expect(r.valid).toBe(false);
  if (!r.valid) expect(r.errors.some(e => e.includes("sha256"))).toBe(true);
});
|
||||
|
||||
// ─── Playbook ────────────────────────────────────────────────────────

// Known-good Playbook fixture; negative tests spread-override one field
// at a time.
const PLAYBOOK_OK = {
  schema_version: PLAYBOOK_SCHEMA_VERSION,
  playbook_id: "pb-scrum-review-001",
  task_type: "scrum_review",
  problem_pattern: "Cargo workspace warning escalation after applier patch",
  useful_context: ["pathway memory bug fingerprints for the file area"],
  model_routing_path: ["x-ai/grok-4.1-fast"],
  commands_worked: ["cargo check --workspace"],
  commands_failed: [],
  validation_steps: ["warning count must not increase"],
  repo_files_touched: ["crates/queryd/src/service.rs"],
  recovery_strategy: "git checkout -- file when cargo red",
  known_failure_modes: ["unused import noise"],
  escalation_threshold: "use kimi-k2:1t when isolation mode rejects 2 attempts",
  acceptance_criteria: ["cargo green", "warning count stable", "rationale-diff aligned"],
  source_run_ids: ["run-xyz", "run-abc"],
  created_at: NOW,
  provenance: PROVENANCE,
};
|
||||
|
||||
test("Playbook: positive validates", () => {
|
||||
const r = validatePlaybook(PLAYBOOK_OK);
|
||||
if (!r.valid) console.error(r.errors);
|
||||
expect(r.valid).toBe(true);
|
||||
});
|
||||
|
||||
test("Playbook: empty source_run_ids rejected (every playbook traces to source — spec)", () => {
|
||||
const r = validatePlaybook({ ...PLAYBOOK_OK, source_run_ids: [] });
|
||||
expect(r.valid).toBe(false);
|
||||
if (!r.valid) expect(r.errors.some(e => e.includes("source_run_ids"))).toBe(true);
|
||||
});
|
||||
|
||||
test("Playbook: empty acceptance_criteria rejected (every playbook needs success criteria — spec)", () => {
|
||||
const r = validatePlaybook({ ...PLAYBOOK_OK, acceptance_criteria: [] });
|
||||
expect(r.valid).toBe(false);
|
||||
if (!r.valid) expect(r.errors.some(e => e.includes("acceptance_criteria"))).toBe(true);
|
||||
});
|
||||
|
||||
// ─── ScratchpadSummary ───────────────────────────────────────────────

// Known-good ScratchpadSummary fixture used by the tests below.
const SCRATCHPAD_OK = {
  schema_version: SCRATCHPAD_SCHEMA_VERSION,
  run_id: "run-abc",
  current_objective: "verify pr_audit mode end-to-end",
  completed_steps: ["restart gateway"],
  failed_steps: ["cloud chat returned 500"],
  pending_steps: ["swap default model"],
  important_paths: ["auditor/checks/inference.ts"],
  decisions: ["defer kimi-k2 swap until upstream returns"],
  unresolved_questions: ["does deepseek match kimi quality?"],
  validation_status: "partial",
  next_command: "bun run auditor/audit_one.ts 11",
  source_scratchpad_hash: SHA,
  summarized_at: NOW,
  provenance: PROVENANCE,
};
|
||||
|
||||
test("ScratchpadSummary: positive validates", () => {
|
||||
const r = validateScratchpadSummary(SCRATCHPAD_OK);
|
||||
if (!r.valid) console.error(r.errors);
|
||||
expect(r.valid).toBe(true);
|
||||
});
|
||||
|
||||
test("ScratchpadSummary: invalid validation_status rejected", () => {
|
||||
const r = validateScratchpadSummary({ ...SCRATCHPAD_OK, validation_status: "tbd" });
|
||||
expect(r.valid).toBe(false);
|
||||
if (!r.valid) expect(r.errors.some(e => e.includes("validation_status"))).toBe(true);
|
||||
});
|
||||
|
||||
test("ScratchpadSummary: short scratchpad_hash rejected", () => {
|
||||
const r = validateScratchpadSummary({ ...SCRATCHPAD_OK, source_scratchpad_hash: "short" });
|
||||
expect(r.valid).toBe(false);
|
||||
if (!r.valid) expect(r.errors.some(e => e.includes("source_scratchpad_hash"))).toBe(true);
|
||||
});
|
||||
|
||||
// ─── ModelLedgerEntry ────────────────────────────────────────────────

// Known-good ModelLedgerEntry fixture; aggregate stats for one model on
// one task type.
const LEDGER_OK = {
  schema_version: MODEL_LEDGER_SCHEMA_VERSION,
  model_name: "kimi-k2:1t",
  model_provider: "ollama_cloud",
  task_type: "pr_audit",
  success_rate: 0.85,
  failure_modes: ["upstream_500", "context_truncation"],
  best_partner_model: "x-ai/grok-4.1-fast",
  escalation_role: "primary",
  cost_usd_p50: 0.0002,
  latency_ms_p50: 50000,
  latency_ms_p95: 90000,
  context_window: 200000,
  sample_count: 47,
  last_updated: NOW,
};
|
||||
|
||||
test("ModelLedgerEntry: positive validates", () => {
|
||||
const r = validateModelLedgerEntry(LEDGER_OK);
|
||||
if (!r.valid) console.error(r.errors);
|
||||
expect(r.valid).toBe(true);
|
||||
});
|
||||
|
||||
test("ModelLedgerEntry: success_rate > 1 rejected", () => {
|
||||
const r = validateModelLedgerEntry({ ...LEDGER_OK, success_rate: 1.5 });
|
||||
expect(r.valid).toBe(false);
|
||||
if (!r.valid) expect(r.errors.some(e => e.includes("success_rate"))).toBe(true);
|
||||
});
|
||||
|
||||
test("ModelLedgerEntry: zero sample_count rejected (no aggregate from zero)", () => {
|
||||
const r = validateModelLedgerEntry({ ...LEDGER_OK, sample_count: 0 });
|
||||
expect(r.valid).toBe(false);
|
||||
if (!r.valid) expect(r.errors.some(e => e.includes("sample_count"))).toBe(true);
|
||||
});
|
||||
|
||||
// ─── RagSample ───────────────────────────────────────────────────────

// Known-good RagSample fixture. Note success_score and source_category
// agree — the validator enforces that pairing (tested below).
const RAG_OK = {
  schema_version: RAG_SAMPLE_SCHEMA_VERSION,
  id: "rag-pb-001",
  title: "Scrum applier rationale-diff alignment",
  content: "When the applier emits a patch with rationale claiming X but the diff shows Y, the rationale-token alignment gate catches it...",
  tags: ["scrum_review", "applier"],
  source_run_id: "run-xyz",
  success_score: "accepted",
  source_category: "accepted",
  embedding_text: "applier rationale-diff alignment guard scrum",
  created_at: NOW,
  provenance: PROVENANCE,
};
|
||||
|
||||
test("RagSample: positive validates", () => {
|
||||
const r = validateRagSample(RAG_OK);
|
||||
if (!r.valid) console.error(r.errors);
|
||||
expect(r.valid).toBe(true);
|
||||
});
|
||||
|
||||
test("RagSample: success_score=rejected forbidden (RAG never takes rejected)", () => {
|
||||
const r = validateRagSample({ ...RAG_OK, success_score: "rejected", source_category: "rejected" });
|
||||
expect(r.valid).toBe(false);
|
||||
if (!r.valid) expect(r.errors.some(e => e.includes("success_score"))).toBe(true);
|
||||
});
|
||||
|
||||
test("RagSample: success_score and source_category must match", () => {
|
||||
const r = validateRagSample({ ...RAG_OK, success_score: "accepted", source_category: "partially_accepted" });
|
||||
expect(r.valid).toBe(false);
|
||||
});
|
||||
|
||||
test("RagSample: whitespace-only content rejected", () => {
|
||||
const r = validateRagSample({ ...RAG_OK, content: " \n " });
|
||||
expect(r.valid).toBe(false);
|
||||
if (!r.valid) expect(r.errors.some(e => e.includes("content"))).toBe(true);
|
||||
});
|
||||
|
||||
// ─── SftSample (the strict one) ──────────────────────────────────────

// Known-good SftSample fixture. quality_score gating is the critical
// invariant exercised by the tests below.
const SFT_OK = {
  schema_version: SFT_SAMPLE_SCHEMA_VERSION,
  id: "sft-pr11-001",
  instruction: "Audit this PR diff against ship-claims.",
  context: "claims: 3 strong, 2 moderate",
  response: "{\"claim_verdicts\": [...]}",
  source_run_id: "run-pr11",
  quality_score: "accepted",
  created_at: NOW,
  provenance: PROVENANCE,
};
|
||||
|
||||
test("SftSample: positive validates", () => {
|
||||
const r = validateSftSample(SFT_OK);
|
||||
if (!r.valid) console.error(r.errors);
|
||||
expect(r.valid).toBe(true);
|
||||
});
|
||||
|
||||
test("SftSample: quality_score=partially_accepted ACCEPTED (--include-partial path)", () => {
|
||||
// Phase 4 update: partial allowed at schema layer; CLI gate decides.
|
||||
const r = validateSftSample({ ...SFT_OK, quality_score: "partially_accepted" });
|
||||
expect(r.valid).toBe(true);
|
||||
});
|
||||
|
||||
test("SftSample: quality_score=rejected REJECTED (spec non-negotiable, no leak)", () => {
|
||||
const r = validateSftSample({ ...SFT_OK, quality_score: "rejected" });
|
||||
expect(r.valid).toBe(false);
|
||||
if (!r.valid) expect(r.errors.some(e => e.includes("quality_score"))).toBe(true);
|
||||
});
|
||||
|
||||
test("SftSample: quality_score=needs_human_review REJECTED (no leak)", () => {
|
||||
const r = validateSftSample({ ...SFT_OK, quality_score: "needs_human_review" });
|
||||
expect(r.valid).toBe(false);
|
||||
});
|
||||
|
||||
test("SftSample: missing context rejected (must be string, even if empty)", () => {
|
||||
const fixture: Record<string, unknown> = { ...SFT_OK };
|
||||
delete fixture.context;
|
||||
const r = validateSftSample(fixture);
|
||||
expect(r.valid).toBe(false);
|
||||
if (!r.valid) expect(r.errors.some(e => e.includes("context"))).toBe(true);
|
||||
});
|
||||
|
||||
test("SftSample: empty-string context allowed", () => {
|
||||
const r = validateSftSample({ ...SFT_OK, context: "" });
|
||||
expect(r.valid).toBe(true);
|
||||
});
|
||||
|
||||
test("SftSample: empty response rejected (no empty pairs)", () => {
|
||||
const r = validateSftSample({ ...SFT_OK, response: "" });
|
||||
expect(r.valid).toBe(false);
|
||||
if (!r.valid) expect(r.errors.some(e => e.includes("response"))).toBe(true);
|
||||
});
|
||||
|
||||
test("SftSample: whitespace-only instruction rejected", () => {
|
||||
const r = validateSftSample({ ...SFT_OK, instruction: " \t\n " });
|
||||
expect(r.valid).toBe(false);
|
||||
if (!r.valid) expect(r.errors.some(e => e.includes("instruction"))).toBe(true);
|
||||
});
|
||||
|
||||
// ─── PreferenceSample ────────────────────────────────────────────────

// Known-good PreferenceSample fixture: chosen/rejected outputs from two
// distinct runs, with an explicit reason for the preference.
const PREF_OK = {
  schema_version: PREFERENCE_SAMPLE_SCHEMA_VERSION,
  id: "pref-task-x-001",
  prompt: "Verify claim: 'all 3 services running on matrix-test'",
  chosen: "{\"backed\": true, \"evidence\": \"systemctl status confirms 3 active\"}",
  rejected: "{\"backed\": true, \"evidence\": \"the README says so\"}",
  reason: "chosen cites runtime evidence, rejected cites doc claim only",
  chosen_run_id: "run-A",
  rejected_run_id: "run-B",
  created_at: NOW,
  provenance: PROVENANCE,
};
|
||||
|
||||
test("PreferenceSample: positive validates", () => {
|
||||
const r = validatePreferenceSample(PREF_OK);
|
||||
if (!r.valid) console.error(r.errors);
|
||||
expect(r.valid).toBe(true);
|
||||
});
|
||||
|
||||
test("PreferenceSample: chosen == rejected rejected (no self-pairing)", () => {
|
||||
const r = validatePreferenceSample({ ...PREF_OK, chosen: "x", rejected: "x" });
|
||||
expect(r.valid).toBe(false);
|
||||
if (!r.valid) expect(r.errors.some(e => e.includes("chosen and rejected"))).toBe(true);
|
||||
});
|
||||
|
||||
test("PreferenceSample: chosen_run_id == rejected_run_id rejected (no self-disagreement)", () => {
|
||||
const r = validatePreferenceSample({ ...PREF_OK, chosen_run_id: "run-A", rejected_run_id: "run-A" });
|
||||
expect(r.valid).toBe(false);
|
||||
if (!r.valid) expect(r.errors.some(e => e.includes("chosen_run_id"))).toBe(true);
|
||||
});
|
||||
|
||||
test("PreferenceSample: empty reason rejected (every preference needs WHY)", () => {
|
||||
const r = validatePreferenceSample({ ...PREF_OK, reason: " " });
|
||||
expect(r.valid).toBe(false);
|
||||
if (!r.valid) expect(r.errors.some(e => e.includes("reason"))).toBe(true);
|
||||
});
|
||||
@ -1,86 +0,0 @@
|
||||
// ScoredRun — output of the deterministic Success Scorer (Phase 3).
|
||||
// Spec mandates 4 categories with explicit reasons; we add scorer
|
||||
// versioning so a future scorer change is detectable in historical data.
|
||||
import {
|
||||
ValidationResult, requireString, requireIsoTimestamp, requireProvenance, requireStringArray, requireNumber,
|
||||
} from "./types";
|
||||
|
||||
export const SCORED_RUN_SCHEMA_VERSION = 1;
// The four verdict buckets the Success Scorer may emit (spec-mandated).
export const SCORE_CATEGORIES = ["accepted", "partially_accepted", "rejected", "needs_human_review"] as const;
export type ScoreCategory = (typeof SCORE_CATEGORIES)[number];
|
||||
|
||||
// One scorer verdict for one run: the collapsed category, the reasons
// behind it, and the optional raw sub-scores, joined back to the source
// EvidenceRecord via the two FK fields.
export interface ScoredRun {
  schema_version: number; // must equal SCORED_RUN_SCHEMA_VERSION
  evidence_run_id: string; // FK to EvidenceRecord.run_id
  evidence_task_id: string; // FK to EvidenceRecord.task_id
  category: ScoreCategory;
  reasons: string[]; // human-readable, e.g. ["cargo_green=true", "anchor_grounding<0.7"]
  scored_at: string; // ISO 8601
  scorer_version: string; // e.g. "v1.0.0" — bumped on scorer code change
  // Sub-scores that the scorer collapsed into the category. Persisted
  // so a downstream UI can show "why" without re-running the scorer.
  sub_scores?: {
    cargo_green?: boolean;
    anchor_grounding?: number; // validator enforces [0, 1]
    schema_valid?: boolean;
    pathway_replay_succeeded?: boolean;
    observer_verdict?: "accept" | "reject" | "cycle";
    [key: string]: unknown; // scorer may attach extra, unvalidated signals
  };
  provenance: {
    source_file: string;
    line_offset?: number;
    sig_hash: string;
    recorded_at: string;
  };
}
|
||||
|
||||
export function validateScoredRun(input: unknown): ValidationResult<ScoredRun> {
|
||||
const errors: string[] = [];
|
||||
if (typeof input !== "object" || input === null) {
|
||||
return { valid: false, errors: ["expected object"] };
|
||||
}
|
||||
const r = input as Record<string, unknown>;
|
||||
let ok = true;
|
||||
if (r.schema_version !== SCORED_RUN_SCHEMA_VERSION) {
|
||||
errors.push(`schema_version: expected ${SCORED_RUN_SCHEMA_VERSION}, got ${JSON.stringify(r.schema_version)}`);
|
||||
ok = false;
|
||||
}
|
||||
ok = requireString(r.evidence_run_id, "evidence_run_id", errors) && ok;
|
||||
ok = requireString(r.evidence_task_id, "evidence_task_id", errors) && ok;
|
||||
ok = requireIsoTimestamp(r.scored_at, "scored_at", errors) && ok;
|
||||
ok = requireString(r.scorer_version, "scorer_version", errors) && ok;
|
||||
ok = requireStringArray(r.reasons, "reasons", errors) && ok;
|
||||
if (Array.isArray(r.reasons) && r.reasons.length === 0) {
|
||||
errors.push("reasons: must be non-empty (every score must have at least one reason)");
|
||||
ok = false;
|
||||
}
|
||||
if (!SCORE_CATEGORIES.includes(r.category as ScoreCategory)) {
|
||||
errors.push(`category: must be one of ${SCORE_CATEGORIES.join("|")}, got ${JSON.stringify(r.category)}`);
|
||||
ok = false;
|
||||
}
|
||||
ok = requireProvenance(r.provenance, "provenance", errors) && ok;
|
||||
|
||||
if (r.sub_scores !== undefined) {
|
||||
if (typeof r.sub_scores !== "object" || r.sub_scores === null) {
|
||||
errors.push("sub_scores: expected object when present");
|
||||
ok = false;
|
||||
} else {
|
||||
const ss = r.sub_scores as Record<string, unknown>;
|
||||
if (ss.anchor_grounding !== undefined) {
|
||||
if (!requireNumber(ss.anchor_grounding, "sub_scores.anchor_grounding", errors)) ok = false;
|
||||
else if ((ss.anchor_grounding as number) < 0 || (ss.anchor_grounding as number) > 1) {
|
||||
errors.push("sub_scores.anchor_grounding: must be in [0, 1]");
|
||||
ok = false;
|
||||
}
|
||||
}
|
||||
if (ss.observer_verdict !== undefined && !["accept", "reject", "cycle"].includes(ss.observer_verdict as string)) {
|
||||
errors.push("sub_scores.observer_verdict: must be accept|reject|cycle");
|
||||
ok = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!ok) return { valid: false, errors };
|
||||
return { valid: true, value: r as unknown as ScoredRun };
|
||||
}
|
||||
@ -1,65 +0,0 @@
|
||||
// ScratchpadSummary — structured normalization of a tree-split or
|
||||
// long-running scratchpad. Distinct from EvidenceRecord because a
|
||||
// scratchpad accumulates across many calls; this schema captures the
|
||||
// state at a checkpoint moment.
|
||||
import {
|
||||
ValidationResult, requireString, requireIsoTimestamp, requireProvenance, requireStringArray,
|
||||
} from "./types";
|
||||
|
||||
export const SCRATCHPAD_SCHEMA_VERSION = 1;

// Checkpoint snapshot of a long-running scratchpad: objective, step
// ledger, open questions, and the hash of the raw scratchpad text for
// diff detection.
export interface ScratchpadSummary {
  schema_version: number; // must equal SCRATCHPAD_SCHEMA_VERSION
  run_id: string;
  current_objective: string;
  completed_steps: string[];
  failed_steps: string[];
  pending_steps: string[];
  important_paths: string[]; // file paths the scratchpad references
  decisions: string[]; // architectural/scope decisions made
  unresolved_questions: string[];
  validation_status: "pass" | "fail" | "partial" | "pending";
  next_command?: string; // recommendation for next action
  source_scratchpad_hash: string; // sha256 of the full source scratchpad text — diff detection
  summarized_at: string; // ISO 8601
  provenance: { source_file: string; line_offset?: number; sig_hash: string; recorded_at: string };
}
|
||||
|
||||
const STATUS = ["pass", "fail", "partial", "pending"];
|
||||
|
||||
export function validateScratchpadSummary(input: unknown): ValidationResult<ScratchpadSummary> {
|
||||
const errors: string[] = [];
|
||||
if (typeof input !== "object" || input === null) return { valid: false, errors: ["expected object"] };
|
||||
const r = input as Record<string, unknown>;
|
||||
let ok = true;
|
||||
|
||||
if (r.schema_version !== SCRATCHPAD_SCHEMA_VERSION) {
|
||||
errors.push(`schema_version: expected ${SCRATCHPAD_SCHEMA_VERSION}, got ${JSON.stringify(r.schema_version)}`);
|
||||
ok = false;
|
||||
}
|
||||
ok = requireString(r.run_id, "run_id", errors) && ok;
|
||||
ok = requireString(r.current_objective, "current_objective", errors) && ok;
|
||||
ok = requireIsoTimestamp(r.summarized_at, "summarized_at", errors) && ok;
|
||||
if (typeof r.source_scratchpad_hash !== "string" || !/^[0-9a-f]{64}$/.test(r.source_scratchpad_hash as string)) {
|
||||
errors.push("source_scratchpad_hash: must be hex sha256");
|
||||
ok = false;
|
||||
}
|
||||
ok = requireStringArray(r.completed_steps, "completed_steps", errors) && ok;
|
||||
ok = requireStringArray(r.failed_steps, "failed_steps", errors) && ok;
|
||||
ok = requireStringArray(r.pending_steps, "pending_steps", errors) && ok;
|
||||
ok = requireStringArray(r.important_paths, "important_paths", errors) && ok;
|
||||
ok = requireStringArray(r.decisions, "decisions", errors) && ok;
|
||||
ok = requireStringArray(r.unresolved_questions, "unresolved_questions", errors) && ok;
|
||||
if (!STATUS.includes(r.validation_status as string)) {
|
||||
errors.push(`validation_status: must be one of ${STATUS.join("|")}`);
|
||||
ok = false;
|
||||
}
|
||||
if (r.next_command !== undefined && typeof r.next_command !== "string") {
|
||||
errors.push("next_command: expected string when present");
|
||||
ok = false;
|
||||
}
|
||||
ok = requireProvenance(r.provenance, "provenance", errors) && ok;
|
||||
|
||||
if (!ok) return { valid: false, errors };
|
||||
return { valid: true, value: r as unknown as ScratchpadSummary };
|
||||
}
|
||||
@ -1,69 +0,0 @@
|
||||
// SftSample — entry in exports/sft/instruction_response.jsonl. Spec
|
||||
// non-negotiable: ONLY accepted runs, never partial/rejected/needs_human.
|
||||
// Validator enforces that invariant — exporters can't bypass.
|
||||
import {
|
||||
ValidationResult, requireString, requireIsoTimestamp, requireProvenance, requireNumber,
|
||||
} from "./types";
|
||||
|
||||
export const SFT_SAMPLE_SCHEMA_VERSION = 1;

// SFT default: only `accepted` ships. With --include-partial CLI flag,
// `partially_accepted` becomes legal. `rejected` and `needs_human_review`
// NEVER ship to SFT — that's the contamination firewall.
export const SFT_QUALITY_SCORES = ["accepted", "partially_accepted"] as const;
export type SftQualityScore = (typeof SFT_QUALITY_SCORES)[number];
|
||||
|
||||
// One instruction→response training pair exported to
// exports/sft/instruction_response.jsonl, traceable to its source run.
export interface SftSample {
  schema_version: number; // must equal SFT_SAMPLE_SCHEMA_VERSION
  id: string;
  instruction: string; // the prompt / user message
  context: string; // retrieved context that was visible (empty string allowed; null/undefined not)
  response: string; // the model output that was accepted
  source_run_id: string;
  quality_score: SftQualityScore; // validator rejects anything outside SFT_QUALITY_SCORES
  created_at: string; // ISO 8601
  provenance: { source_file: string; line_offset?: number; sig_hash: string; recorded_at: string };
}
|
||||
|
||||
export function validateSftSample(input: unknown): ValidationResult<SftSample> {
|
||||
const errors: string[] = [];
|
||||
if (typeof input !== "object" || input === null) return { valid: false, errors: ["expected object"] };
|
||||
const r = input as Record<string, unknown>;
|
||||
let ok = true;
|
||||
|
||||
if (r.schema_version !== SFT_SAMPLE_SCHEMA_VERSION) {
|
||||
errors.push(`schema_version: expected ${SFT_SAMPLE_SCHEMA_VERSION}, got ${JSON.stringify(r.schema_version)}`);
|
||||
ok = false;
|
||||
}
|
||||
ok = requireString(r.id, "id", errors) && ok;
|
||||
ok = requireString(r.instruction, "instruction", errors) && ok;
|
||||
ok = requireString(r.response, "response", errors) && ok;
|
||||
ok = requireString(r.source_run_id, "source_run_id", errors) && ok;
|
||||
ok = requireIsoTimestamp(r.created_at, "created_at", errors) && ok;
|
||||
ok = requireProvenance(r.provenance, "provenance", errors) && ok;
|
||||
|
||||
// Empty pair guard.
|
||||
if (typeof r.instruction === "string" && (r.instruction as string).trim().length === 0) {
|
||||
errors.push("instruction: must be non-whitespace (no empty pairs)");
|
||||
ok = false;
|
||||
}
|
||||
if (typeof r.response === "string" && (r.response as string).trim().length === 0) {
|
||||
errors.push("response: must be non-whitespace (no empty pairs)");
|
||||
ok = false;
|
||||
}
|
||||
// Context is required-string but empty is allowed (some SFT samples
|
||||
// are pure instruction→response with no retrieval context).
|
||||
if (typeof r.context !== "string") {
|
||||
errors.push("context: expected string (use empty string for no-context samples)");
|
||||
ok = false;
|
||||
}
|
||||
// The non-negotiable: SFT samples MUST have quality_score in
|
||||
// SFT_QUALITY_SCORES. Anything else is a leak.
|
||||
if (!SFT_QUALITY_SCORES.includes(r.quality_score as SftQualityScore)) {
|
||||
errors.push(`quality_score: must be one of ${SFT_QUALITY_SCORES.join("|")} (no rejected/needs_human leak into SFT — spec non-negotiable). Got ${JSON.stringify(r.quality_score)}`);
|
||||
ok = false;
|
||||
}
|
||||
|
||||
if (!ok) return { valid: false, errors };
|
||||
return { valid: true, value: r as unknown as SftSample };
|
||||
}
|
||||
@ -1,190 +0,0 @@
|
||||
// stage_receipt.ts — forensic-grade per-stage receipt.
|
||||
//
|
||||
// Distinct from auditor/schemas/distillation/receipt.ts (Phase 1):
|
||||
// - Phase 1 Receipt is per-script invocation, format inherited from
|
||||
// the early auditor wiring
|
||||
// - StageReceipt (THIS file) matches the now.md Phase 5 spec exactly
|
||||
// and is the canonical artifact for pipeline observability
|
||||
//
|
||||
// Every pipeline stage (collect, score, export-rag, export-sft,
|
||||
// export-preference, future extract-playbooks/index) emits ONE
|
||||
// StageReceipt per run. Receipts are joined by `run_id` (shared
|
||||
// across all stages of a single `run-all` invocation) so a future
|
||||
// query can aggregate across the whole pipeline.
|
||||
|
||||
import {
|
||||
ValidationResult, requireString, requireNumber, requireIsoTimestamp, requireSha256,
|
||||
requireStringArray,
|
||||
} from "./types";
|
||||
|
||||
export const STAGE_RECEIPT_SCHEMA_VERSION = 1;

// Every pipeline stage that may emit a StageReceipt. Receipts from one
// `run-all` invocation share a run_id, so this list is also the join key
// domain for cross-stage queries.
export const STAGE_NAMES = [
  "collect", // build_evidence_index — materialize source jsonls → EvidenceRecord
  "score", // score_runs — EvidenceRecord → ScoredRun
  "export-rag", // exports/rag/playbooks.jsonl
  "export-sft", // exports/sft/instruction_response.jsonl
  "export-preference", // exports/preference/chosen_rejected.jsonl
  // Reserved for future stages — accept them in the schema so a stage
  // can be added without bumping schema_version.
  "extract-playbooks",
  "index",
] as const;
export type StageName = (typeof STAGE_NAMES)[number];
|
||||
|
||||
// One file consumed or produced by a stage, content-addressed by sha256.
export interface StageFileRef {
  path: string; // relative to repo root
  sha256: string; // 64-char hex
  bytes?: number;
  record_count?: number; // line count for jsonl, when meaningful
}

// The full input or output side of a stage: file refs plus roll-ups.
export interface StageIO {
  files: StageFileRef[];
  record_count: number;
  hash: string; // 64-char hex — aggregate over all file hashes (sorted)
}

// Row-disposition counters for one stage run.
export interface StageStats {
  accepted: number; // rows that ended up in the stage's output
  rejected: number; // explicit category=rejected (Score), invalid pairs (Preference), etc.
  quarantined: number; // routed to exports/quarantine/* with structured reason
  skipped: number; // parse failures, schema violations at write time
}

// Outcome of the stage's own validation pass.
export interface StageValidation {
  passed: boolean; // explicit boolean — never inferred (spec non-negotiable)
  errors: string[];
  warnings: string[];
}

// The canonical per-stage receipt (now.md Phase 5 spec). One per stage
// per pipeline run; joined across stages by run_id.
export interface StageReceipt {
  schema_version: number; // must equal STAGE_RECEIPT_SCHEMA_VERSION
  run_id: string; // shared across all stages of one pipeline run
  stage: StageName;
  timestamp: string; // ISO 8601 — stage start
  git_commit: string; // 40-char hex
  inputs: StageIO;
  outputs: StageIO;
  stats: StageStats;
  validation: StageValidation;
  duration_ms: number;
}
|
||||
|
||||
function validateStageIO(v: unknown, field: string, errors: string[]): boolean {
|
||||
if (typeof v !== "object" || v === null) {
|
||||
errors.push(`${field}: expected object`);
|
||||
return false;
|
||||
}
|
||||
const io = v as Record<string, unknown>;
|
||||
let ok = true;
|
||||
if (!Array.isArray(io.files)) {
|
||||
errors.push(`${field}.files: expected array`);
|
||||
ok = false;
|
||||
} else {
|
||||
for (let i = 0; i < io.files.length; i++) {
|
||||
const f = io.files[i] as Record<string, unknown>;
|
||||
if (typeof f !== "object" || f === null) {
|
||||
errors.push(`${field}.files[${i}]: expected object`);
|
||||
ok = false;
|
||||
continue;
|
||||
}
|
||||
ok = requireString(f.path, `${field}.files[${i}].path`, errors) && ok;
|
||||
ok = requireSha256(f.sha256, `${field}.files[${i}].sha256`, errors) && ok;
|
||||
if (f.bytes !== undefined && typeof f.bytes !== "number") {
|
||||
errors.push(`${field}.files[${i}].bytes: expected number when present`);
|
||||
ok = false;
|
||||
}
|
||||
if (f.record_count !== undefined && typeof f.record_count !== "number") {
|
||||
errors.push(`${field}.files[${i}].record_count: expected number when present`);
|
||||
ok = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
ok = requireNumber(io.record_count, `${field}.record_count`, errors) && ok;
|
||||
ok = requireSha256(io.hash, `${field}.hash`, errors) && ok;
|
||||
return ok;
|
||||
}
|
||||
|
||||
export function validateStageReceipt(input: unknown): ValidationResult<StageReceipt> {
|
||||
const errors: string[] = [];
|
||||
if (typeof input !== "object" || input === null) {
|
||||
return { valid: false, errors: ["expected object"] };
|
||||
}
|
||||
const r = input as Record<string, unknown>;
|
||||
let ok = true;
|
||||
|
||||
if (r.schema_version !== STAGE_RECEIPT_SCHEMA_VERSION) {
|
||||
errors.push(`schema_version: expected ${STAGE_RECEIPT_SCHEMA_VERSION}, got ${JSON.stringify(r.schema_version)}`);
|
||||
ok = false;
|
||||
}
|
||||
ok = requireString(r.run_id, "run_id", errors) && ok;
|
||||
if (typeof r.run_id === "string" && r.run_id.length < 8) {
|
||||
errors.push("run_id: too short — expect uuid-like");
|
||||
ok = false;
|
||||
}
|
||||
if (typeof r.stage !== "string" || !STAGE_NAMES.includes(r.stage as StageName)) {
|
||||
errors.push(`stage: must be one of ${STAGE_NAMES.join("|")}`);
|
||||
ok = false;
|
||||
}
|
||||
ok = requireIsoTimestamp(r.timestamp, "timestamp", errors) && ok;
|
||||
if (typeof r.git_commit !== "string" || !/^[0-9a-f]{40}$/.test(r.git_commit as string)) {
|
||||
errors.push("git_commit: must be 40-char hex");
|
||||
ok = false;
|
||||
}
|
||||
if (typeof r.duration_ms !== "number") {
|
||||
errors.push("duration_ms: expected number");
|
||||
ok = false;
|
||||
}
|
||||
if (typeof r.inputs !== "object" || r.inputs === null) {
|
||||
errors.push("inputs: expected object");
|
||||
ok = false;
|
||||
} else {
|
||||
ok = validateStageIO(r.inputs, "inputs", errors) && ok;
|
||||
}
|
||||
if (typeof r.outputs !== "object" || r.outputs === null) {
|
||||
errors.push("outputs: expected object");
|
||||
ok = false;
|
||||
} else {
|
||||
ok = validateStageIO(r.outputs, "outputs", errors) && ok;
|
||||
}
|
||||
if (typeof r.stats !== "object" || r.stats === null) {
|
||||
errors.push("stats: expected object");
|
||||
ok = false;
|
||||
} else {
|
||||
const s = r.stats as Record<string, unknown>;
|
||||
for (const k of ["accepted", "rejected", "quarantined", "skipped"]) {
|
||||
if (typeof s[k] !== "number") { errors.push(`stats.${k}: expected number`); ok = false; }
|
||||
}
|
||||
}
|
||||
if (typeof r.validation !== "object" || r.validation === null) {
|
||||
errors.push("validation: expected object");
|
||||
ok = false;
|
||||
} else {
|
||||
const v = r.validation as Record<string, unknown>;
|
||||
if (typeof v.passed !== "boolean") {
|
||||
errors.push("validation.passed: must be boolean (explicit, never inferred)");
|
||||
ok = false;
|
||||
}
|
||||
if (!Array.isArray(v.errors)) { errors.push("validation.errors: expected array"); ok = false; }
|
||||
if (!Array.isArray(v.warnings)) { errors.push("validation.warnings: expected array"); ok = false; }
|
||||
if (Array.isArray(v.errors)) ok = requireStringArray(v.errors, "validation.errors", errors) && ok;
|
||||
if (Array.isArray(v.warnings)) ok = requireStringArray(v.warnings, "validation.warnings", errors) && ok;
|
||||
}
|
||||
|
||||
if (!ok) return { valid: false, errors };
|
||||
return { valid: true, value: r as unknown as StageReceipt };
|
||||
}
|
||||
|
||||
// Compute the canonical aggregate hash over a list of file refs.
|
||||
// Sorted by path so order-of-iteration doesn't drift the hash.
|
||||
// Each entry contributes "<path>|<sha256>|<record_count>" so two
|
||||
// files with identical content but different paths produce distinct
|
||||
// digests (real difference = real hash difference).
|
||||
export async function aggregateIoHash(files: StageFileRef[]): Promise<string> {
|
||||
const sorted = [...files].sort((a, b) => a.path.localeCompare(b.path));
|
||||
const parts = sorted.map(f => `${f.path}|${f.sha256}|${f.record_count ?? 0}`);
|
||||
const h = new Bun.CryptoHasher("sha256");
|
||||
h.update(parts.join("\n"));
|
||||
return h.digest("hex");
|
||||
}
|
||||
@ -1,141 +0,0 @@
|
||||
// Shared types for distillation schemas. Hand-rolled validators (no Zod
|
||||
// dependency) — bun:test runs them; runtime cost is one tiny function
|
||||
// per record. Pattern: each schema exports `validate(x): ValidationResult`
|
||||
// returning `{valid: true, value}` or `{valid: false, errors}`.
|
||||
//
|
||||
// Why hand-rolled: the auditor + scrum + observer pipelines emit JSONL
|
||||
// rows in shapes that already work; we want to ENFORCE those shapes
|
||||
// without adding a 100KB dependency or rewriting producers. The
|
||||
// validators codify what we already produce.
|
||||
//
|
||||
// Naming: schemas live as nouns (`EvidenceRecord`), validators as
|
||||
// `validate<Noun>`. Each schema file exports both the type and the
|
||||
// validator.
|
||||
|
||||
// Where a derived row came from: source file, optional position, a
// content-hash dedup key, and when the link was recorded. Attached to
// every distillation schema.
export interface Provenance {
  // Path to the JSONL or other source where this row came from. Always
  // relative to /home/profit/lakehouse so receipts are reproducible
  // across deploys with the same repo layout.
  source_file: string;

  // Optional byte offset / line number into the source file. Lets a
  // future "open the source row" UI jump directly to the line. Some
  // sources (single-row JSON files like _playbook_lessons/*.json) don't
  // need this.
  line_offset?: number;

  // SHA-256 of the canonical JSON of the source row (sorted keys, no
  // whitespace). This is the dedup key — running distillation twice on
  // the same source produces identical sig_hash, so duplicates are
  // detectable without full row comparison.
  sig_hash: string;

  // ISO 8601 of when this provenance link was recorded — usually the
  // moment the unified Evidence Index ran. Distinct from the source
  // row's own timestamp, which lives on the EvidenceRecord itself.
  recorded_at: string;
}
|
||||
|
||||
// Returned by every schema validator. The shape is `{valid: true, value}`
|
||||
// for success (so callers can use `value` with the right type narrowed)
|
||||
// or `{valid: false, errors}` for failure (so callers can surface
|
||||
// every error at once, not just the first).
|
||||
export type ValidationResult<T> =
|
||||
| { valid: true; value: T }
|
||||
| { valid: false; errors: string[] };
|
||||
|
||||
// Standard helpers used by every schema. Centralized so naming +
|
||||
// error message format stay consistent across schemas.
|
||||
|
||||
export function requireString(v: unknown, field: string, errors: string[]): v is string {
|
||||
if (typeof v !== "string") {
|
||||
errors.push(`${field}: expected string, got ${typeof v}`);
|
||||
return false;
|
||||
}
|
||||
if (v.length === 0) {
|
||||
errors.push(`${field}: must be non-empty`);
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
export function requireNumber(v: unknown, field: string, errors: string[]): v is number {
|
||||
if (typeof v !== "number" || !Number.isFinite(v)) {
|
||||
errors.push(`${field}: expected finite number, got ${typeof v}`);
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
export function requireIsoTimestamp(v: unknown, field: string, errors: string[]): v is string {
|
||||
if (!requireString(v, field, errors)) return false;
|
||||
// Permissive ISO 8601: YYYY-MM-DDTHH:MM:SS(.fraction)?(Z|±HH:MM)?
|
||||
const re = /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:?\d{2})?$/;
|
||||
if (!re.test(v as string)) {
|
||||
errors.push(`${field}: not a valid ISO 8601 timestamp: ${(v as string).slice(0, 60)}`);
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
export function requireSha256(v: unknown, field: string, errors: string[]): v is string {
|
||||
if (!requireString(v, field, errors)) return false;
|
||||
if (!/^[0-9a-f]{64}$/.test(v as string)) {
|
||||
errors.push(`${field}: not a valid hex sha256: ${(v as string).slice(0, 80)}`);
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
export function requireProvenance(v: unknown, field: string, errors: string[]): v is Provenance {
|
||||
if (typeof v !== "object" || v === null) {
|
||||
errors.push(`${field}: expected object, got ${v === null ? "null" : typeof v}`);
|
||||
return false;
|
||||
}
|
||||
const p = v as Record<string, unknown>;
|
||||
let ok = true;
|
||||
ok = requireString(p.source_file, `${field}.source_file`, errors) && ok;
|
||||
ok = requireSha256(p.sig_hash, `${field}.sig_hash`, errors) && ok;
|
||||
ok = requireIsoTimestamp(p.recorded_at, `${field}.recorded_at`, errors) && ok;
|
||||
if (p.line_offset !== undefined && typeof p.line_offset !== "number") {
|
||||
errors.push(`${field}.line_offset: expected number when present`);
|
||||
ok = false;
|
||||
}
|
||||
return ok;
|
||||
}
|
||||
|
||||
export function requireStringArray(v: unknown, field: string, errors: string[]): v is string[] {
|
||||
if (!Array.isArray(v)) {
|
||||
errors.push(`${field}: expected array, got ${typeof v}`);
|
||||
return false;
|
||||
}
|
||||
for (let i = 0; i < v.length; i++) {
|
||||
if (typeof v[i] !== "string") {
|
||||
errors.push(`${field}[${i}]: expected string, got ${typeof v[i]}`);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Compute the canonical sha256 used for sig_hash. Sorts keys so the
|
||||
// hash is stable regardless of producer's serialization order. Uses
|
||||
// Bun.CryptoHasher (sync, fast) rather than node:crypto — matches the
|
||||
// rest of the auditor.
|
||||
export async function canonicalSha256(obj: unknown): Promise<string> {
|
||||
const ordered = orderKeys(obj);
|
||||
const json = JSON.stringify(ordered);
|
||||
const hasher = new Bun.CryptoHasher("sha256");
|
||||
hasher.update(json);
|
||||
return hasher.digest("hex");
|
||||
}
|
||||
|
||||
function orderKeys(v: unknown): unknown {
|
||||
if (v === null || typeof v !== "object") return v;
|
||||
if (Array.isArray(v)) return v.map(orderKeys);
|
||||
const out: Record<string, unknown> = {};
|
||||
for (const k of Object.keys(v as object).sort()) {
|
||||
out[k] = orderKeys((v as Record<string, unknown>)[k]);
|
||||
}
|
||||
return out;
|
||||
}
|
||||
@ -2,7 +2,7 @@
|
||||
// if something can't be verified from a check, it goes into `evidence`
|
||||
// so the verdict is inspectable, not a black box.
|
||||
|
||||
export type CheckKind = "static" | "dynamic" | "inference" | "kb_query" | "kimi_architect";
|
||||
export type CheckKind = "static" | "dynamic" | "inference" | "kb_query";
|
||||
|
||||
export type Severity = "info" | "warn" | "block";
|
||||
|
||||
@ -18,14 +18,7 @@ export interface Claim {
|
||||
// Heuristic rating of how strong the claim is. "green+tested"
|
||||
// is strong; "should work" is weak. Drives sensitivity — stronger
|
||||
// claims get harder-blocked on weak evidence.
|
||||
//
|
||||
// "empirical" is a separate class: runtime measurements like
|
||||
// "N cloud calls" / "306s end-to-end" / "accepted on attempt N".
|
||||
// These cannot be verified from a static diff — only from the test
|
||||
// output that produced them. Inference skips diff-verification for
|
||||
// empirical claims; they become info-level context unless a future
|
||||
// runtime_evidence check contradicts them.
|
||||
strength: "weak" | "moderate" | "strong" | "empirical";
|
||||
strength: "weak" | "moderate" | "strong";
|
||||
}
|
||||
|
||||
export interface Finding {
|
||||
|
||||
@ -13,17 +13,10 @@ import { readFile } from "node:fs/promises";
|
||||
import { createHash } from "node:crypto";
|
||||
import type { Gap, Proposal } from "./types.ts";
|
||||
|
||||
// Phase 44 migration (2026-04-27): bot/propose.ts now flows through
|
||||
// the gateway's /v1/chat instead of hitting the sidecar's /generate
|
||||
// directly. /v1/usage tracks the call, Langfuse traces it, observer
|
||||
// sees it. Gateway owns the routing.
|
||||
//
|
||||
// 2026-04-28: gpt-oss:120b → deepseek-v3.2 via Ollama Pro. Newer
|
||||
// DeepSeek revision, faster, still on the same OLLAMA_CLOUD_KEY.
|
||||
const GATEWAY_URL = process.env.LH_GATEWAY_URL ?? "http://localhost:3100";
|
||||
const SIDECAR_URL = process.env.LH_SIDECAR_URL ?? "http://localhost:3200";
|
||||
const REPO_ROOT = "/home/profit/lakehouse";
|
||||
const PRD_PATH = `${REPO_ROOT}/docs/PRD.md`;
|
||||
const CLOUD_MODEL = process.env.LH_BOT_MODEL ?? "deepseek-v3.2";
|
||||
const CLOUD_MODEL = process.env.LH_BOT_MODEL ?? "gpt-oss:120b";
|
||||
const MAX_TOKENS = 6000;
|
||||
|
||||
export async function findGaps(): Promise<Gap[]> {
|
||||
@ -79,16 +72,13 @@ export async function generateProposal(gap: Gap, historySummary: string = ""): P
|
||||
sections.push("Propose a small change that addresses this gap. Respond with the JSON object only.");
|
||||
const userPrompt = sections.join("\n");
|
||||
|
||||
const r = await fetch(`${GATEWAY_URL}/v1/chat`, {
|
||||
const r = await fetch(`${SIDECAR_URL}/generate`, {
|
||||
method: "POST",
|
||||
headers: { "content-type": "application/json" },
|
||||
body: JSON.stringify({
|
||||
model: CLOUD_MODEL,
|
||||
provider: "ollama_cloud",
|
||||
messages: [
|
||||
{ role: "system", content: SYSTEM_PROMPT },
|
||||
{ role: "user", content: userPrompt },
|
||||
],
|
||||
system: SYSTEM_PROMPT,
|
||||
prompt: userPrompt,
|
||||
temperature: 0.2,
|
||||
max_tokens: MAX_TOKENS,
|
||||
think: false,
|
||||
@ -96,10 +86,10 @@ export async function generateProposal(gap: Gap, historySummary: string = ""): P
|
||||
signal: AbortSignal.timeout(180000), // cloud T3 can be slow — 3 min
|
||||
});
|
||||
if (!r.ok) {
|
||||
throw new Error(`gateway /v1/chat ${r.status}: ${await r.text()}`);
|
||||
throw new Error(`sidecar ${r.status}: ${await r.text()}`);
|
||||
}
|
||||
const j = await r.json() as any;
|
||||
const raw: string = j?.choices?.[0]?.message?.content ?? "";
|
||||
const raw: string = j.text ?? j.response ?? "";
|
||||
const usage = j.usage ?? {};
|
||||
const tokens = (usage.prompt_tokens ?? 0) + (usage.completion_tokens ?? 0);
|
||||
|
||||
|
||||
@ -1,91 +0,0 @@
|
||||
# Mode router config — task_class → mode mapping
|
||||
#
|
||||
# `preferred_mode` is the first choice for a task class; `fallback_modes`
|
||||
# get tried in order if the preferred one isn't available (LLM Team can
|
||||
# return Unknown mode for some, OR the matrix has stronger signal for a
|
||||
# fallback). `default_model` seeds the mode runner's model field if the
|
||||
# caller doesn't override.
|
||||
#
|
||||
# Modes are dispatched against LLM Team UI (localhost:5000/api/run) for
|
||||
# now; future Rust-native runners will short-circuit before the proxy.
|
||||
# See crates/gateway/src/v1/mode.rs for the dispatch path.
|
||||
|
||||
[[task_class]]
|
||||
name = "scrum_review"
|
||||
# 2026-04-26 pass5 variance test (5 reps × 4 conditions, grok-4.1-fast,
|
||||
# pathway_memory.rs): composed corpus LOST 5/5 vs isolation (Δ −1.8
|
||||
# grounded findings, p=0.031). See docs/MODE_RUNNER_TUNING_PLAN.md.
|
||||
# Default is now isolation — bug fingerprints + adversarial framing +
|
||||
# file content carries strong models without matrix noise. The
|
||||
# `codereview_lakehouse` matrix path remains available via force_mode
|
||||
# (auto-downgrades to isolation on strong models — see the
|
||||
# is_strong_model gate in crates/gateway/src/v1/mode.rs).
|
||||
preferred_mode = "codereview_isolation"
|
||||
fallback_modes = ["codereview_lakehouse", "codereview", "consensus", "ladder"]
|
||||
default_model = "qwen3-coder:480b"
|
||||
# Corpora kept defined so experimental modes (codereview_matrix_only,
|
||||
# pass2/pass5 sweeps) and weak-model rescue rungs can still pull them.
|
||||
# scrum_findings_v1 is built but EXCLUDED — bake-off showed 24% OOB
|
||||
# line citations from cross-file drift, only safe with same-file gating.
|
||||
matrix_corpus = ["lakehouse_arch_v1", "lakehouse_symbols_v1"]
|
||||
|
||||
[[task_class]]
|
||||
name = "contract_analysis"
|
||||
preferred_mode = "deep_analysis"
|
||||
fallback_modes = ["research", "extract"]
|
||||
default_model = "kimi-k2:1t"
|
||||
matrix_corpus = "chicago_permits_v1"
|
||||
|
||||
[[task_class]]
|
||||
name = "staffing_inference"
|
||||
# Staffing-domain native enrichment runner — Pass 4 (2026-04-26).
|
||||
# Same composer architecture as codereview_lakehouse but with staffing
|
||||
# framing + workers corpus. Validates that the modes-as-prompt-molders
|
||||
# pattern generalizes beyond code review.
|
||||
preferred_mode = "staffing_inference_lakehouse"
|
||||
fallback_modes = ["ladder", "consensus", "pipeline"]
|
||||
# 2026-04-28: gpt-oss-120b:free → kimi-k2.6 via Ollama Pro. Coding-
|
||||
# specialized, faster than gpt-oss, on the same OLLAMA_CLOUD_KEY so
|
||||
# no extra provider hop.
|
||||
default_model = "kimi-k2.6"
|
||||
matrix_corpus = "workers_500k_v8"
|
||||
|
||||
[[task_class]]
|
||||
name = "fact_extract"
|
||||
preferred_mode = "extract"
|
||||
fallback_modes = ["distill"]
|
||||
default_model = "qwen2.5"
|
||||
matrix_corpus = "kb_team_runs_v1"
|
||||
|
||||
[[task_class]]
|
||||
name = "doc_drift_check"
|
||||
preferred_mode = "drift"
|
||||
fallback_modes = ["validator"]
|
||||
# 2026-04-28: gpt-oss:120b → gemini-3-flash-preview via Ollama Pro.
|
||||
# Speed leader on factual checking, same OLLAMA_CLOUD_KEY.
|
||||
default_model = "gemini-3-flash-preview"
|
||||
matrix_corpus = "distilled_factual_v20260423095819"
|
||||
|
||||
[[task_class]]
|
||||
name = "pr_audit"
|
||||
# Auditor's claim-vs-diff verification mode (2026-04-26 rebuild).
|
||||
# Replaces the auditor's hand-rolled inference check with the mode-runner
|
||||
# composer: pathway memory (PR-level patterns) + lakehouse_answers_v1
|
||||
# corpus (prior accepted reviews + observer escalations) + adversarial
|
||||
# JSON-shaped framing. Default model is paid Ollama Cloud kimi-k2:1t for
|
||||
# strong claim-grounding; tie-breaker via auditor-side env override.
|
||||
preferred_mode = "pr_audit"
|
||||
fallback_modes = ["consensus", "ladder"]
|
||||
# kimi-k2:1t broken upstream 2026-04-27 (Ollama Cloud 500 ISE, multi-hour
|
||||
# sustained outage verified by repeated probes). deepseek-v3.1:671b is
|
||||
# the drop-in substitute — proven working end-to-end through pr_audit
|
||||
# during Phase 5 distillation acceptance testing.
|
||||
default_model = "deepseek-v3.1:671b"
|
||||
matrix_corpus = "lakehouse_answers_v1"
|
||||
|
||||
# Fallback when task_class isn't in the table — useful for ad-hoc calls
|
||||
# during development that don't yet have a mapped mode.
|
||||
[default]
|
||||
preferred_mode = "pipeline"
|
||||
fallback_modes = ["consensus", "ladder"]
|
||||
default_model = "qwen3.5:latest"
|
||||
@ -1,106 +0,0 @@
|
||||
# Phase 39: Provider Registry
|
||||
#
|
||||
# Per-provider base_url, auth scheme, and default model. The gateway's
|
||||
# /v1/chat dispatcher reads this file at boot to populate its provider
|
||||
# table. Secrets (API keys) come from /etc/lakehouse/secrets.toml or
|
||||
# environment variables — NEVER inline a key here.
|
||||
#
|
||||
# Adding a new provider:
|
||||
# 1. New [[provider]] block with name, base_url, auth, default_model
|
||||
# 2. Matching adapter at crates/aibridge/src/providers/<name>.rs
|
||||
# implementing the ProviderAdapter trait (chat + embed + unload)
|
||||
# 3. Route arm in crates/gateway/src/v1/mod.rs matching on `name`
|
||||
# 4. Model-prefix routing hint in resolve_provider() if the provider
|
||||
# uses an "<name>/..." model prefix (e.g. "openrouter/...")
|
||||
|
||||
[[provider]]
|
||||
name = "ollama"
|
||||
base_url = "http://localhost:11434"
|
||||
auth = "none"
|
||||
default_model = "qwen3.5:latest"
|
||||
# Hot-path local inference. No bearer needed — direct to Ollama as of
|
||||
# 2026-05-02 (Python sidecar's pass-through wrapper retired). Model
|
||||
# names are bare (e.g. "qwen3.5:latest", not "ollama/qwen3.5:latest").
|
||||
|
||||
[[provider]]
|
||||
name = "ollama_cloud"
|
||||
base_url = "https://ollama.com"
|
||||
auth = "bearer"
|
||||
auth_env = "OLLAMA_CLOUD_KEY"
|
||||
default_model = "deepseek-v3.2"
|
||||
# Cloud-tier Ollama (Pro plan as of 2026-04-28). Key resolved from
|
||||
# OLLAMA_CLOUD_KEY at gateway boot; Pro tier upgraded the account so
|
||||
# rate limits + model access widen without a key change. Model-prefix
|
||||
# routing: "cloud/<model>" auto-routes here. 39-model fleet now
|
||||
# includes deepseek-v3.2, deepseek-v4-{flash,pro}, gemini-3-flash-
|
||||
# preview, glm-{5,5.1}, kimi-k2.6, qwen3-coder-next.
|
||||
# 2026-04-28: default upgraded gpt-oss:120b → deepseek-v3.2 (newest
|
||||
# DeepSeek revision). NOTE: kimi-k2:1t is upstream-broken (HTTP 500
|
||||
# on Ollama Pro probe 2026-04-28) — do not route to it. Use kimi-k2.6
|
||||
# instead, which is what staffing_inference points at.
|
||||
|
||||
[[provider]]
|
||||
name = "openrouter"
|
||||
base_url = "https://openrouter.ai/api/v1"
|
||||
auth = "bearer"
|
||||
auth_env = "OPENROUTER_API_KEY"
|
||||
auth_fallback_files = ["/home/profit/.env", "/root/llm_team_config.json"]
|
||||
default_model = "x-ai/grok-4.1-fast"
|
||||
# Multi-provider gateway. Covers Anthropic, Google, OpenAI, MiniMax,
|
||||
# Qwen, Gemma, etc. Key resolved via crates/gateway/src/v1/openrouter.rs
|
||||
# resolve_openrouter_key() — env first, then fallback files.
|
||||
# Model-prefix routing: "openrouter/<vendor>/<model>" auto-routes here,
|
||||
# prefix stripped before upstream call.
|
||||
|
||||
[[provider]]
|
||||
name = "opencode"
|
||||
base_url = "https://opencode.ai/zen/v1"
|
||||
# Unified endpoint — covers BOTH Zen (pay-per-token Anthropic/OpenAI/
|
||||
# Gemini frontier) AND Go (flat-sub Kimi/GLM/DeepSeek/Qwen/Minimax).
|
||||
# Upstream bills per-model: Zen models hit Zen balance, Go models hit
|
||||
# Go subscription cap. /zen/go/v1 is the Go-only sub-path (rejects
|
||||
# Zen models), kept for reference but not used by this provider.
|
||||
auth = "bearer"
|
||||
auth_env = "OPENCODE_API_KEY"
|
||||
default_model = "claude-opus-4-7"
|
||||
# OpenCode (Zen + GO unified endpoint). One sk-* key reaches Claude
|
||||
# Opus 4.7, GPT-5.5-pro, Gemini 3.1-pro, Kimi K2.6, DeepSeek, GLM,
|
||||
# Qwen, plus 4 free-tier models. OpenAI-compatible Chat Completions
|
||||
# at /v1/chat/completions. Model-prefix routing: "opencode/<name>"
|
||||
# auto-routes here, prefix stripped before upstream call.
|
||||
# Key file: /etc/lakehouse/opencode.env (loaded via systemd EnvironmentFile).
|
||||
# Model catalog: curl -H "Authorization: Bearer ..." https://opencode.ai/zen/v1/models
|
||||
# Note: /zen/go/v1 is the GO-only sub-path (Kimi/GLM/DeepSeek tier);
|
||||
# /zen/v1 covers everything including Anthropic (which /zen/go/v1 rejects).
|
||||
|
||||
[[provider]]
|
||||
name = "kimi"
|
||||
base_url = "https://api.kimi.com/coding/v1"
|
||||
auth = "bearer"
|
||||
auth_env = "KIMI_API_KEY"
|
||||
default_model = "kimi-for-coding"
|
||||
# Direct Kimi For Coding provider. `api.kimi.com` is a SEPARATE account
|
||||
# system from `api.moonshot.ai` and `api.moonshot.cn` — keys are NOT
|
||||
# interchangeable. Used as a fallback when Ollama Cloud's kimi-k2.6 is
|
||||
# unavailable and OpenRouter's `moonshotai/kimi-k2.6` is rate-limited.
|
||||
# (Was `kimi-k2:1t` here pre-2026-05-03 — that model is upstream-broken
|
||||
# and removed from operator guidance.)
|
||||
# Model id: `kimi-for-coding` (kimi-k2.6 underneath).
|
||||
# Key file: /etc/lakehouse/kimi.env (loaded via systemd EnvironmentFile).
|
||||
# Model-prefix routing: "kimi/<model>" auto-routes here, prefix stripped.
|
||||
|
||||
# Planned (Phase 40 long-horizon — adapters not yet shipped):
|
||||
#
|
||||
# [[provider]]
|
||||
# name = "gemini"
|
||||
# base_url = "https://generativelanguage.googleapis.com/v1beta"
|
||||
# auth = "api_key_query"
|
||||
# auth_env = "GEMINI_API_KEY"
|
||||
# default_model = "gemini-2.0-flash"
|
||||
#
|
||||
# [[provider]]
|
||||
# name = "claude"
|
||||
# base_url = "https://api.anthropic.com/v1"
|
||||
# auth = "x_api_key"
|
||||
# auth_env = "ANTHROPIC_API_KEY"
|
||||
# default_model = "claude-3-5-sonnet-latest"
|
||||
@ -1,61 +0,0 @@
|
||||
# Phase 40: Routing Engine Configuration
|
||||
#
|
||||
# Human-editable rules for model → provider routing.
|
||||
# Matching order: first match wins.
|
||||
|
||||
[[rule]]
|
||||
model_pattern = "gpt-4*"
|
||||
provider = "openrouter"
|
||||
max_tokens = 4096
|
||||
temperature = 0.7
|
||||
|
||||
[[rule]]
|
||||
model_pattern = "claude*"
|
||||
provider = "claude"
|
||||
max_tokens = 4096
|
||||
temperature = 0.7
|
||||
|
||||
[[rule]]
|
||||
model_pattern = "gemini*"
|
||||
provider = "gemini"
|
||||
max_tokens = 8192
|
||||
temperature = 0.9
|
||||
|
||||
[[rule]]
|
||||
model_pattern = "qwen3.5*"
|
||||
provider = "ollama"
|
||||
max_tokens = 4096
|
||||
temperature = 0.3
|
||||
|
||||
[[rule]]
|
||||
model_pattern = "qwen3*"
|
||||
provider = "ollama"
|
||||
max_tokens = 2048
|
||||
temperature = 0.3
|
||||
|
||||
[[rule]]
|
||||
model_pattern = "gpt-oss*"
|
||||
provider = "ollama"
|
||||
temperature = 0.1
|
||||
|
||||
[[rule]]
|
||||
model_pattern = "*"
|
||||
provider = "ollama"
|
||||
temperature = 0.5
|
||||
|
||||
# Fallback chain: if primary fails, try these in order
|
||||
fallback = ["ollama", "openrouter"]
|
||||
|
||||
# Cost gating (tokens = cents per 1M)
|
||||
[cost]
|
||||
ollama = 0
|
||||
openrouter = 15
|
||||
claude = 15
|
||||
gemini = 0
|
||||
|
||||
# Daily budget per provider (cents)
|
||||
[daily_budget]
|
||||
ollama = 0
|
||||
openrouter = 1000
|
||||
claude = 500
|
||||
gemini = 0
|
||||
@ -11,5 +11,3 @@ serde = { workspace = true }
|
||||
serde_json = { workspace = true }
|
||||
tracing = { workspace = true }
|
||||
reqwest = { version = "0.12", default-features = false, features = ["json", "rustls-tls"] }
|
||||
async-trait = "0.1"
|
||||
lru = "0.12"
|
||||
|
||||
@ -1,74 +1,12 @@
|
||||
use lru::LruCache;
|
||||
use reqwest::Client;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::num::NonZeroUsize;
|
||||
use std::sync::Mutex;
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
/// HTTP client for Ollama (post-2026-05-02 — sidecar dropped).
|
||||
///
|
||||
/// `base_url` was historically the Python sidecar at `:3200`, which
|
||||
/// pass-through-proxied to Ollama at `:11434`. The sidecar added zero
|
||||
/// logic on the hot path (embed.py + generate.py + rerank.py +
|
||||
/// admin.py = ~120 LOC of pure Ollama wrappers), so this client now
|
||||
/// talks to Ollama directly and the sidecar process can be retired.
|
||||
///
|
||||
/// What stayed Python: `lab_ui.py` + `pipeline_lab.py` (~888 LOC of
|
||||
/// dev-mode Streamlit-shape UIs) — those aren't on the runtime hot
|
||||
/// path and continue running for prompt experimentation.
|
||||
///
|
||||
/// `generate()` has two transport modes:
|
||||
/// - When `gateway_url` is None (default), posts directly to Ollama's
|
||||
/// `${base_url}/api/generate`.
|
||||
/// - When `gateway_url` is `Some(url)`, posts to `${url}/v1/chat`
|
||||
/// with `provider="ollama"` so the call appears in `/v1/usage` and
|
||||
/// Langfuse traces.
|
||||
///
|
||||
/// `embed()`, `rerank()`, and admin methods always go direct to
|
||||
/// Ollama — no `/v1` equivalent for those surfaces yet.
|
||||
///
|
||||
/// Phase 44 part 2 (2026-04-27): the gateway URL is wired in by
|
||||
/// callers that want observability (vectord modules); it's left
|
||||
/// unset by callers that ARE the gateway internals (avoids self-loops
|
||||
/// + redundant hops).
|
||||
/// Per-text embed cache key. We key on (model, text) so different
|
||||
/// model selections produce distinct cache lines — a query embedded
|
||||
/// under nomic-embed-text-v2-moe must NOT collide with the same
|
||||
/// query under nomic-embed-text v1.
|
||||
#[derive(Eq, PartialEq, Hash, Clone)]
|
||||
struct EmbedCacheKey {
|
||||
model: String,
|
||||
text: String,
|
||||
}
|
||||
|
||||
/// Default LRU cache size — 4096 entries × ~6KB per 768-d f64
|
||||
/// vector ≈ 24MB. Sized for typical staffing-domain repetition
|
||||
/// (coordinator workflows have query repetition rates around 70-90%
|
||||
/// per session). Tunable via [aibridge].embed_cache_size in the
|
||||
/// config; 0 disables the cache entirely.
|
||||
const DEFAULT_EMBED_CACHE_SIZE: usize = 4096;
|
||||
|
||||
/// HTTP client for the Python AI sidecar.
|
||||
#[derive(Clone)]
|
||||
pub struct AiClient {
|
||||
client: Client,
|
||||
base_url: String,
|
||||
gateway_url: Option<String>,
|
||||
/// Closes the 63× perf gap with Go side. Mirrors the shape of
|
||||
/// Go's internal/embed/cached.go::CachedProvider — same
|
||||
/// (model, text) → vector caching, same nil-disable semantics.
|
||||
/// None = caching disabled (cache_size=0); Some = bounded LRU.
|
||||
embed_cache: Option<Arc<Mutex<LruCache<EmbedCacheKey, Vec<f64>>>>>,
|
||||
/// Hit / miss counters for /admin observability + load-test
|
||||
/// validation. Atomic so Clone'd AiClients share the same counts.
|
||||
embed_cache_hits: Arc<AtomicU64>,
|
||||
embed_cache_misses: Arc<AtomicU64>,
|
||||
/// Pinned at construction time so the EmbedResponse can carry
|
||||
/// dimension consistently even when every text was a cache hit
|
||||
/// (no fresh upstream call to learn the dim from). Set on first
|
||||
/// successful Ollama embed; checked on every cache hit.
|
||||
cached_dim: Arc<AtomicU64>,
|
||||
}
|
||||
|
||||
// -- Request/Response types --
|
||||
@ -141,386 +79,68 @@ pub struct RerankResponse {
|
||||
|
||||
impl AiClient {
|
||||
pub fn new(base_url: &str) -> Self {
|
||||
Self::with_embed_cache(base_url, DEFAULT_EMBED_CACHE_SIZE)
|
||||
}
|
||||
|
||||
/// Constructs an AiClient with an explicit embed-cache size.
|
||||
/// Pass 0 to disable the cache entirely (matches Go-side
|
||||
/// CachedProvider's nil-cache semantics).
|
||||
pub fn with_embed_cache(base_url: &str, cache_size: usize) -> Self {
|
||||
let client = Client::builder()
|
||||
.timeout(Duration::from_secs(120))
|
||||
.build()
|
||||
.expect("failed to build HTTP client");
|
||||
let embed_cache = if cache_size > 0 {
|
||||
// SAFETY: cache_size > 0 just verified, NonZeroUsize::new
|
||||
// returns Some.
|
||||
let cap = NonZeroUsize::new(cache_size).expect("cache_size > 0");
|
||||
Some(Arc::new(Mutex::new(LruCache::new(cap))))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
Self {
|
||||
client,
|
||||
base_url: base_url.trim_end_matches('/').to_string(),
|
||||
gateway_url: None,
|
||||
embed_cache,
|
||||
embed_cache_hits: Arc::new(AtomicU64::new(0)),
|
||||
embed_cache_misses: Arc::new(AtomicU64::new(0)),
|
||||
cached_dim: Arc::new(AtomicU64::new(0)),
|
||||
}
|
||||
}
|
||||
|
||||
/// Cache hit/miss/size snapshot. Useful for /admin endpoints +
|
||||
/// load-test validation ("did the cache fire as expected?").
|
||||
pub fn embed_cache_stats(&self) -> (u64, u64, usize) {
|
||||
let hits = self.embed_cache_hits.load(Ordering::Relaxed);
|
||||
let misses = self.embed_cache_misses.load(Ordering::Relaxed);
|
||||
let len = self
|
||||
.embed_cache
|
||||
.as_ref()
|
||||
.map(|c| c.lock().map(|g| g.len()).unwrap_or(0))
|
||||
.unwrap_or(0);
|
||||
(hits, misses, len)
|
||||
}
|
||||
|
||||
/// Same as `new`, but every `generate()` is routed through
|
||||
/// `${gateway_url}/v1/chat` (provider=ollama) for observability.
|
||||
/// Use this for callers OUTSIDE the gateway. Inside the gateway
|
||||
/// itself, prefer `new()` — calling /v1/chat from /v1/chat works
|
||||
/// (no infinite loop, ollama_arm doesn't use AiClient) but adds
|
||||
/// a wasted localhost hop.
|
||||
pub fn new_with_gateway(base_url: &str, gateway_url: &str) -> Self {
|
||||
let mut c = Self::new(base_url);
|
||||
c.gateway_url = Some(gateway_url.trim_end_matches('/').to_string());
|
||||
c
|
||||
}
|
||||
|
||||
/// Reachability + version check. Hits Ollama's `/api/version`,
|
||||
/// returns a sidecar-shaped envelope so callers reading
|
||||
/// `.status` / `.ollama_url` don't break across the
|
||||
/// pre-/post-2026-05-02 cutover.
|
||||
pub async fn health(&self) -> Result<serde_json::Value, String> {
|
||||
let resp = self.client
|
||||
.get(format!("{}/api/version", self.base_url))
|
||||
.get(format!("{}/health", self.base_url))
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| format!("ollama unreachable: {e}"))?;
|
||||
let body: serde_json::Value = resp.json().await
|
||||
.map_err(|e| format!("invalid response: {e}"))?;
|
||||
Ok(serde_json::json!({
|
||||
"status": "ok",
|
||||
"ollama_url": &self.base_url,
|
||||
"ollama_version": body.get("version"),
|
||||
}))
|
||||
.map_err(|e| format!("sidecar unreachable: {e}"))?;
|
||||
resp.json().await.map_err(|e| format!("invalid response: {e}"))
|
||||
}
|
||||
|
||||
/// Embed with per-text LRU caching. Mirrors Go-side
|
||||
/// CachedProvider behavior: cache key is (model, text);
|
||||
/// cache-hit texts skip the sidecar; cache-miss texts batch
|
||||
/// into a single sidecar call; results are interleaved in the
|
||||
/// caller's input order.
|
||||
///
|
||||
/// Closes ~95% of the load-test perf gap vs Go side (loadgen
|
||||
/// 2026-05-01: Rust 128 RPS → with cache ≥ 7000 RPS expected
|
||||
/// for warm-cache workloads). Cold-cache behavior unchanged
|
||||
/// (every text is a miss → single sidecar call, identical to
|
||||
/// pre-cache).
|
||||
pub async fn embed(&self, req: EmbedRequest) -> Result<EmbedResponse, String> {
|
||||
let model_key = req.model.clone().unwrap_or_default();
|
||||
let resp = self.client
|
||||
.post(format!("{}/embed", self.base_url))
|
||||
.json(&req)
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| format!("embed request failed: {e}"))?;
|
||||
|
||||
// Fast path: cache disabled → original behavior.
|
||||
let Some(cache) = self.embed_cache.as_ref() else {
|
||||
return self.embed_uncached(&req).await;
|
||||
};
|
||||
if req.texts.is_empty() {
|
||||
return self.embed_uncached(&req).await;
|
||||
if !resp.status().is_success() {
|
||||
let text = resp.text().await.unwrap_or_default();
|
||||
return Err(format!("embed error ({}): {text}", text.len()));
|
||||
}
|
||||
|
||||
// First pass: check cache for each text. Track which positions
|
||||
// need a sidecar fetch.
|
||||
let mut embeddings: Vec<Option<Vec<f64>>> = vec![None; req.texts.len()];
|
||||
let mut miss_indices: Vec<usize> = Vec::new();
|
||||
let mut miss_texts: Vec<String> = Vec::new();
|
||||
{
|
||||
let mut guard = cache.lock().map_err(|e| format!("cache lock poisoned: {e}"))?;
|
||||
for (i, text) in req.texts.iter().enumerate() {
|
||||
let key = EmbedCacheKey { model: model_key.clone(), text: text.clone() };
|
||||
if let Some(vec) = guard.get(&key) {
|
||||
embeddings[i] = Some(vec.clone());
|
||||
self.embed_cache_hits.fetch_add(1, Ordering::Relaxed);
|
||||
} else {
|
||||
miss_indices.push(i);
|
||||
miss_texts.push(text.clone());
|
||||
self.embed_cache_misses.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// All hit? Return immediately. Use cached_dim to populate
|
||||
// the response dimension (no sidecar to ask).
|
||||
if miss_indices.is_empty() {
|
||||
let dim = self.cached_dim.load(Ordering::Relaxed) as usize;
|
||||
let dim = if dim == 0 { embeddings[0].as_ref().map(|v| v.len()).unwrap_or(0) } else { dim };
|
||||
return Ok(EmbedResponse {
|
||||
embeddings: embeddings.into_iter().map(|opt| opt.expect("filled")).collect(),
|
||||
model: req.model.unwrap_or_else(|| "nomic-embed-text".to_string()),
|
||||
dimensions: dim,
|
||||
});
|
||||
}
|
||||
|
||||
// Second pass: fetch the misses in one sidecar call.
|
||||
let miss_req = EmbedRequest { texts: miss_texts.clone(), model: req.model.clone() };
|
||||
let resp = self.embed_uncached(&miss_req).await?;
|
||||
if resp.embeddings.len() != miss_texts.len() {
|
||||
return Err(format!(
|
||||
"embed cache: sidecar returned {} embeddings for {} texts",
|
||||
resp.embeddings.len(),
|
||||
miss_texts.len()
|
||||
));
|
||||
}
|
||||
|
||||
// Pin cached_dim on first successful response.
|
||||
if resp.dimensions > 0 {
|
||||
self.cached_dim.store(resp.dimensions as u64, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
// Insert misses into cache + fill response slots.
|
||||
{
|
||||
let mut guard = cache.lock().map_err(|e| format!("cache lock poisoned: {e}"))?;
|
||||
for (j, idx) in miss_indices.iter().enumerate() {
|
||||
let key = EmbedCacheKey {
|
||||
model: model_key.clone(),
|
||||
text: miss_texts[j].clone(),
|
||||
};
|
||||
let vec = resp.embeddings[j].clone();
|
||||
guard.put(key, vec.clone());
|
||||
embeddings[*idx] = Some(vec);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(EmbedResponse {
|
||||
embeddings: embeddings.into_iter().map(|opt| opt.expect("filled")).collect(),
|
||||
model: resp.model,
|
||||
dimensions: resp.dimensions,
|
||||
})
|
||||
}
|
||||
|
||||
/// Direct Ollama call — used internally by embed() for cache-miss
|
||||
/// batches and as the transparent fallback when the cache is
|
||||
/// disabled. Loops per-text against `${base_url}/api/embed`,
|
||||
/// matching the sidecar's pre-2026-05-02 behavior. Ollama 0.4+
|
||||
/// supports batch input but per-text keeps compatibility broader
|
||||
/// + lets cache-miss-only batches share the loop with cold runs.
|
||||
async fn embed_uncached(&self, req: &EmbedRequest) -> Result<EmbedResponse, String> {
|
||||
let model = req.model.clone().unwrap_or_else(|| "nomic-embed-text".to_string());
|
||||
let mut embeddings: Vec<Vec<f64>> = Vec::with_capacity(req.texts.len());
|
||||
|
||||
for text in &req.texts {
|
||||
let resp = self.client
|
||||
.post(format!("{}/api/embed", self.base_url))
|
||||
.json(&serde_json::json!({
|
||||
"model": &model,
|
||||
"input": text,
|
||||
}))
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| format!("embed request failed: {e}"))?;
|
||||
|
||||
if !resp.status().is_success() {
|
||||
let body = resp.text().await.unwrap_or_default();
|
||||
return Err(format!("ollama embed error: {body}"));
|
||||
}
|
||||
// Ollama returns {"embeddings": [[...]], "model": "...", ...}.
|
||||
// The outer `embeddings` is always a list; for a scalar input
|
||||
// we get a single inner vector.
|
||||
let parsed: serde_json::Value = resp.json().await
|
||||
.map_err(|e| format!("embed parse error: {e}"))?;
|
||||
let arr = parsed.get("embeddings")
|
||||
.and_then(|v| v.as_array())
|
||||
.ok_or_else(|| format!("ollama embed: missing 'embeddings' field in {parsed}"))?;
|
||||
if arr.is_empty() {
|
||||
return Err("ollama embed: empty embeddings array".to_string());
|
||||
}
|
||||
let first = arr[0].as_array()
|
||||
.ok_or_else(|| "ollama embed: embeddings[0] not an array".to_string())?;
|
||||
let vec: Vec<f64> = first.iter()
|
||||
.filter_map(|n| n.as_f64())
|
||||
.collect();
|
||||
if vec.is_empty() {
|
||||
return Err("ollama embed: numeric coercion produced empty vector".to_string());
|
||||
}
|
||||
embeddings.push(vec);
|
||||
}
|
||||
|
||||
let dimensions = embeddings.first().map(|v| v.len()).unwrap_or(0);
|
||||
Ok(EmbedResponse {
|
||||
embeddings,
|
||||
model,
|
||||
dimensions,
|
||||
})
|
||||
resp.json().await.map_err(|e| format!("embed parse error: {e}"))
|
||||
}
|
||||
|
||||
pub async fn generate(&self, req: GenerateRequest) -> Result<GenerateResponse, String> {
|
||||
if let Some(gw) = self.gateway_url.as_deref() {
|
||||
return self.generate_via_gateway(gw, req).await;
|
||||
}
|
||||
// Direct Ollama path. Used by gateway internals (so the ollama
|
||||
// provider can call upstream without a self-loop through
|
||||
// /v1/chat) and by any consumer that wants raw transport
|
||||
// without /v1/usage accounting.
|
||||
let model = req.model.clone().unwrap_or_else(|| "qwen3.5:latest".to_string());
|
||||
let mut body = serde_json::json!({
|
||||
"model": &model,
|
||||
"prompt": &req.prompt,
|
||||
"stream": false,
|
||||
});
|
||||
let mut options = serde_json::Map::new();
|
||||
if let Some(t) = req.temperature {
|
||||
options.insert("temperature".to_string(), serde_json::json!(t));
|
||||
}
|
||||
if let Some(mt) = req.max_tokens {
|
||||
options.insert("num_predict".to_string(), serde_json::json!(mt));
|
||||
}
|
||||
if !options.is_empty() {
|
||||
body["options"] = serde_json::Value::Object(options);
|
||||
}
|
||||
if let Some(sys) = &req.system {
|
||||
body["system"] = serde_json::json!(sys);
|
||||
}
|
||||
if let Some(th) = req.think {
|
||||
body["think"] = serde_json::json!(th);
|
||||
}
|
||||
|
||||
let resp = self.client
|
||||
.post(format!("{}/api/generate", self.base_url))
|
||||
.json(&body)
|
||||
.post(format!("{}/generate", self.base_url))
|
||||
.json(&req)
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| format!("generate request failed: {e}"))?;
|
||||
|
||||
if !resp.status().is_success() {
|
||||
let text = resp.text().await.unwrap_or_default();
|
||||
return Err(format!("ollama generate error: {text}"));
|
||||
return Err(format!("generate error: {text}"));
|
||||
}
|
||||
let parsed: serde_json::Value = resp.json().await
|
||||
.map_err(|e| format!("generate parse error: {e}"))?;
|
||||
|
||||
Ok(GenerateResponse {
|
||||
text: parsed.get("response").and_then(|v| v.as_str()).unwrap_or("").to_string(),
|
||||
model,
|
||||
tokens_evaluated: parsed.get("prompt_eval_count").and_then(|v| v.as_u64()),
|
||||
tokens_generated: parsed.get("eval_count").and_then(|v| v.as_u64()),
|
||||
})
|
||||
resp.json().await.map_err(|e| format!("generate parse error: {e}"))
|
||||
}
|
||||
|
||||
/// Phase 44 part 2: route generate() through the gateway's
|
||||
/// /v1/chat with provider="ollama" so the call lands in
|
||||
/// /v1/usage + Langfuse. Translates between the sidecar
|
||||
/// GenerateRequest/Response shape and the OpenAI-compat
|
||||
/// chat shape on the wire.
|
||||
async fn generate_via_gateway(&self, gateway_url: &str, req: GenerateRequest) -> Result<GenerateResponse, String> {
|
||||
let mut messages = Vec::with_capacity(2);
|
||||
if let Some(sys) = &req.system {
|
||||
messages.push(serde_json::json!({"role": "system", "content": sys}));
|
||||
}
|
||||
messages.push(serde_json::json!({"role": "user", "content": req.prompt}));
|
||||
let mut body = serde_json::json!({
|
||||
"messages": messages,
|
||||
"provider": "ollama",
|
||||
});
|
||||
if let Some(m) = &req.model { body["model"] = serde_json::json!(m); }
|
||||
if let Some(t) = req.temperature { body["temperature"] = serde_json::json!(t); }
|
||||
if let Some(mt) = req.max_tokens { body["max_tokens"] = serde_json::json!(mt); }
|
||||
if let Some(th) = req.think { body["think"] = serde_json::json!(th); }
|
||||
|
||||
pub async fn rerank(&self, req: RerankRequest) -> Result<RerankResponse, String> {
|
||||
let resp = self.client
|
||||
.post(format!("{}/v1/chat", gateway_url))
|
||||
.json(&body)
|
||||
.post(format!("{}/rerank", self.base_url))
|
||||
.json(&req)
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| format!("/v1/chat request failed: {e}"))?;
|
||||
.map_err(|e| format!("rerank request failed: {e}"))?;
|
||||
|
||||
if !resp.status().is_success() {
|
||||
let text = resp.text().await.unwrap_or_default();
|
||||
return Err(format!("/v1/chat error: {text}"));
|
||||
return Err(format!("rerank error: {text}"));
|
||||
}
|
||||
let parsed: serde_json::Value = resp.json().await
|
||||
.map_err(|e| format!("/v1/chat parse error: {e}"))?;
|
||||
|
||||
let text = parsed
|
||||
.pointer("/choices/0/message/content")
|
||||
.and_then(|v| v.as_str())
|
||||
.unwrap_or("")
|
||||
.to_string();
|
||||
let model = parsed.get("model")
|
||||
.and_then(|v| v.as_str())
|
||||
.unwrap_or_else(|| req.model.as_deref().unwrap_or(""))
|
||||
.to_string();
|
||||
let prompt_tokens = parsed.pointer("/usage/prompt_tokens").and_then(|v| v.as_u64());
|
||||
let completion_tokens = parsed.pointer("/usage/completion_tokens").and_then(|v| v.as_u64());
|
||||
|
||||
Ok(GenerateResponse {
|
||||
text,
|
||||
model,
|
||||
tokens_evaluated: prompt_tokens,
|
||||
tokens_generated: completion_tokens,
|
||||
})
|
||||
}
|
||||
|
||||
/// Cross-encoder reranking via Ollama generate. Asks the model to
|
||||
/// rate each document's relevance to the query 0-10, then sorts
|
||||
/// descending. Mirrors the sidecar's pre-2026-05-02 algorithm
|
||||
/// exactly so callers see the same scores.
|
||||
pub async fn rerank(&self, req: RerankRequest) -> Result<RerankResponse, String> {
|
||||
let model = req.model.clone().unwrap_or_else(|| "qwen3.5:latest".to_string());
|
||||
let mut scored: Vec<ScoredDocument> = Vec::with_capacity(req.documents.len());
|
||||
|
||||
for (i, doc) in req.documents.iter().enumerate() {
|
||||
let prompt = format!(
|
||||
"Rate the relevance of the following document to the query on a scale of 0 to 10. \
|
||||
Respond with ONLY a number.\n\n\
|
||||
Query: {}\n\n\
|
||||
Document: {}\n\n\
|
||||
Score:",
|
||||
req.query, doc,
|
||||
);
|
||||
let resp = self.client
|
||||
.post(format!("{}/api/generate", self.base_url))
|
||||
.json(&serde_json::json!({
|
||||
"model": &model,
|
||||
"prompt": prompt,
|
||||
"stream": false,
|
||||
"options": {"temperature": 0.0, "num_predict": 8},
|
||||
}))
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| format!("rerank request failed: {e}"))?;
|
||||
|
||||
if !resp.status().is_success() {
|
||||
let body = resp.text().await.unwrap_or_default();
|
||||
return Err(format!("ollama rerank error: {body}"));
|
||||
}
|
||||
let parsed: serde_json::Value = resp.json().await
|
||||
.map_err(|e| format!("rerank parse error: {e}"))?;
|
||||
let text = parsed.get("response").and_then(|v| v.as_str()).unwrap_or("").trim();
|
||||
// Parse the leading number; tolerate "7", "7.5", "7 — strong match".
|
||||
let score = text.split_whitespace().next()
|
||||
.and_then(|t| t.parse::<f64>().ok())
|
||||
.unwrap_or(0.0)
|
||||
.clamp(0.0, 10.0);
|
||||
|
||||
scored.push(ScoredDocument {
|
||||
index: i,
|
||||
text: doc.clone(),
|
||||
score,
|
||||
});
|
||||
}
|
||||
|
||||
scored.sort_by(|a, b| b.score.partial_cmp(&a.score).unwrap_or(std::cmp::Ordering::Equal));
|
||||
if let Some(k) = req.top_k {
|
||||
scored.truncate(k);
|
||||
}
|
||||
Ok(RerankResponse { results: scored, model })
|
||||
resp.json().await.map_err(|e| format!("rerank parse error: {e}"))
|
||||
}
|
||||
|
||||
/// Force Ollama to unload the named model from VRAM (keep_alive=0).
|
||||
@ -529,116 +149,40 @@ impl AiClient {
|
||||
/// profile's model can linger in VRAM next to the new one.
|
||||
pub async fn unload_model(&self, model: &str) -> Result<serde_json::Value, String> {
|
||||
let resp = self.client
|
||||
.post(format!("{}/api/generate", self.base_url))
|
||||
.json(&serde_json::json!({
|
||||
"model": model,
|
||||
"prompt": "",
|
||||
"keep_alive": 0,
|
||||
"stream": false,
|
||||
}))
|
||||
.post(format!("{}/admin/unload", self.base_url))
|
||||
.json(&serde_json::json!({ "model": model }))
|
||||
.send().await
|
||||
.map_err(|e| format!("unload request failed: {e}"))?;
|
||||
if !resp.status().is_success() {
|
||||
let text = resp.text().await.unwrap_or_default();
|
||||
return Err(format!("ollama unload error: {text}"));
|
||||
return Err(format!("unload error: {text}"));
|
||||
}
|
||||
// Ollama returns 200 with the empty-prompt response shape.
|
||||
// Fold into the legacy {"unloaded": "<model>"} envelope so
|
||||
// callers' parsing doesn't break.
|
||||
Ok(serde_json::json!({ "unloaded": model }))
|
||||
resp.json().await.map_err(|e| format!("unload parse error: {e}"))
|
||||
}
|
||||
|
||||
/// Ask Ollama to load the named model into VRAM proactively. Makes
|
||||
/// the first real request after profile activation fast (no cold-load
|
||||
/// latency). Empty prompts confuse some models, so we send a single
|
||||
/// space + cap num_predict=1 (matches the sidecar's prior behavior).
|
||||
/// latency).
|
||||
pub async fn preload_model(&self, model: &str) -> Result<serde_json::Value, String> {
|
||||
let resp = self.client
|
||||
.post(format!("{}/api/generate", self.base_url))
|
||||
.json(&serde_json::json!({
|
||||
"model": model,
|
||||
"prompt": " ",
|
||||
"keep_alive": "5m",
|
||||
"stream": false,
|
||||
"options": {"num_predict": 1},
|
||||
}))
|
||||
.post(format!("{}/admin/preload", self.base_url))
|
||||
.json(&serde_json::json!({ "model": model }))
|
||||
.send().await
|
||||
.map_err(|e| format!("preload request failed: {e}"))?;
|
||||
if !resp.status().is_success() {
|
||||
let text = resp.text().await.unwrap_or_default();
|
||||
return Err(format!("ollama preload error: {text}"));
|
||||
return Err(format!("preload error: {text}"));
|
||||
}
|
||||
let parsed: serde_json::Value = resp.json().await
|
||||
.map_err(|e| format!("preload parse error: {e}"))?;
|
||||
Ok(serde_json::json!({
|
||||
"preloaded": model,
|
||||
"load_duration_ns": parsed.get("load_duration"),
|
||||
"total_duration_ns": parsed.get("total_duration"),
|
||||
}))
|
||||
resp.json().await.map_err(|e| format!("preload parse error: {e}"))
|
||||
}
|
||||
|
||||
/// GPU + loaded-model snapshot. Combines nvidia-smi output (when
|
||||
/// available) with Ollama's /api/ps. Same shape as the prior
|
||||
/// sidecar /admin/vram endpoint so callers don't need updating.
|
||||
/// GPU + loaded-model snapshot from the sidecar. Combines nvidia-smi
|
||||
/// output (if available) with Ollama's /api/ps.
|
||||
pub async fn vram_snapshot(&self) -> Result<serde_json::Value, String> {
|
||||
let resp = self.client
|
||||
.get(format!("{}/api/ps", self.base_url))
|
||||
.get(format!("{}/admin/vram", self.base_url))
|
||||
.send().await
|
||||
.map_err(|e| format!("ollama ps request failed: {e}"))?;
|
||||
let loaded: Vec<serde_json::Value> = if resp.status().is_success() {
|
||||
let parsed: serde_json::Value = resp.json().await.unwrap_or(serde_json::Value::Null);
|
||||
parsed.get("models")
|
||||
.and_then(|v| v.as_array())
|
||||
.map(|arr| arr.iter().map(|m| serde_json::json!({
|
||||
"name": m.get("name"),
|
||||
"size_vram_mib": m.get("size_vram").and_then(|v| v.as_u64()).map(|n| n / (1024 * 1024)),
|
||||
"expires_at": m.get("expires_at"),
|
||||
})).collect())
|
||||
.unwrap_or_default()
|
||||
} else {
|
||||
Vec::new()
|
||||
};
|
||||
|
||||
let gpu = nvidia_smi_snapshot();
|
||||
|
||||
Ok(serde_json::json!({
|
||||
"gpu": gpu,
|
||||
"ollama_loaded": loaded,
|
||||
}))
|
||||
.map_err(|e| format!("vram request failed: {e}"))?;
|
||||
resp.json().await.map_err(|e| format!("vram parse error: {e}"))
|
||||
}
|
||||
}
|
||||
|
||||
/// One-shot nvidia-smi poll. Returns Null if the tool isn't on PATH
|
||||
/// or the call fails. Mirrors the sidecar's `_nvidia_smi_snapshot`
|
||||
/// shape exactly so callers reading vram_snapshot don't break.
|
||||
fn nvidia_smi_snapshot() -> serde_json::Value {
|
||||
use std::process::Command;
|
||||
let out = Command::new("nvidia-smi")
|
||||
.args([
|
||||
"--query-gpu=memory.used,memory.total,utilization.gpu,name",
|
||||
"--format=csv,noheader,nounits",
|
||||
])
|
||||
.output();
|
||||
let stdout = match out {
|
||||
Ok(o) if o.status.success() => o.stdout,
|
||||
_ => return serde_json::Value::Null,
|
||||
};
|
||||
let line = String::from_utf8_lossy(&stdout);
|
||||
let line = line.trim();
|
||||
if line.is_empty() {
|
||||
return serde_json::Value::Null;
|
||||
}
|
||||
let parts: Vec<&str> = line.split(',').map(|s| s.trim()).collect();
|
||||
if parts.len() < 4 {
|
||||
return serde_json::Value::Null;
|
||||
}
|
||||
let used = parts[0].parse::<u64>().unwrap_or(0);
|
||||
let total = parts[1].parse::<u64>().unwrap_or(0);
|
||||
let util = parts[2].parse::<u64>().unwrap_or(0);
|
||||
serde_json::json!({
|
||||
"name": parts[3],
|
||||
"used_mib": used,
|
||||
"total_mib": total,
|
||||
"utilization_pct": util,
|
||||
})
|
||||
}
|
||||
|
||||
@ -13,9 +13,11 @@
|
||||
use std::collections::HashMap;
|
||||
use std::sync::OnceLock;
|
||||
|
||||
// `estimate_tokens` moved to `shared::model_matrix::ModelMatrix::estimate_tokens`
|
||||
// (cdc24d8). All callers migrated; the deprecated wrapper that stood in its
|
||||
// place has been removed since it had zero external consumers.
|
||||
/// Rough token count. `chars / 4` ceiling. See module docs for why
|
||||
/// this heuristic is sufficient.
|
||||
pub fn estimate_tokens(text: &str) -> usize {
|
||||
(text.chars().count() + 3) / 4
|
||||
}
|
||||
|
||||
/// Phase 21 — per-model context windows, mirroring the TS table in
|
||||
/// `tests/multi-agent/agent.ts`. Anchored on each model's documented
|
||||
@ -82,8 +84,8 @@ pub fn assert_context_budget(
|
||||
let window = context_window_for(model);
|
||||
let safety = opts.safety_margin.unwrap_or(DEFAULT_SAFETY_MARGIN);
|
||||
let max_tokens = opts.max_tokens.unwrap_or(DEFAULT_MAX_TOKENS);
|
||||
let sys_tokens = opts.system.map(shared::model_matrix::ModelMatrix::estimate_tokens).unwrap_or(0);
|
||||
let estimated = shared::model_matrix::ModelMatrix::estimate_tokens(prompt) + sys_tokens + max_tokens;
|
||||
let sys_tokens = opts.system.map(estimate_tokens).unwrap_or(0);
|
||||
let estimated = estimate_tokens(prompt) + sys_tokens + max_tokens;
|
||||
let remaining = window as i64 - estimated as i64 - safety as i64;
|
||||
let check = BudgetCheck { estimated, window, remaining };
|
||||
if remaining < 0 && !opts.bypass {
|
||||
@ -107,10 +109,14 @@ pub fn overflow_message(model: &str, check: &BudgetCheck, over_by: usize, safety
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
// Deprecated-function behavior is now canonically tested in
|
||||
// crates/shared/src/model_matrix.rs. This test was the legacy
|
||||
// pin that preceded the migration; delete when the deprecated
|
||||
// wrapper itself goes (see the #[deprecated] attribute).
|
||||
#[test]
|
||||
fn estimate_tokens_ceiling_divides_by_four() {
|
||||
assert_eq!(estimate_tokens(""), 0);
|
||||
assert_eq!(estimate_tokens("abc"), 1); // 3 → ceil(3/4) = 1
|
||||
assert_eq!(estimate_tokens("abcd"), 1); // 4 → ceil(4/4) = 1
|
||||
assert_eq!(estimate_tokens("abcde"), 2); // 5 → ceil(5/4) = 2
|
||||
assert_eq!(estimate_tokens(&"x".repeat(400)), 100);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn context_window_known_and_fallback() {
|
||||
@ -173,7 +179,7 @@ mod tests {
|
||||
).unwrap();
|
||||
assert!(with_sys.estimated > without_sys.estimated,
|
||||
"system prompt should raise estimate");
|
||||
assert_eq!(with_sys.estimated - without_sys.estimated, shared::model_matrix::ModelMatrix::estimate_tokens(&sys));
|
||||
assert_eq!(with_sys.estimated - without_sys.estimated, estimate_tokens(&sys));
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@ -138,17 +138,6 @@ pub struct ContinuableOutcome {
|
||||
pub empty_retries: usize,
|
||||
pub continuations: usize,
|
||||
pub final_complete: bool,
|
||||
/// Sum of `prompt_tokens` across every generator call made to
|
||||
/// produce this outcome — including empty retries and continuations.
|
||||
/// Lets callers (gateway execution loop, observability) stamp
|
||||
/// accurate per-task usage without second-guessing the retry fan-out.
|
||||
pub prompt_tokens: u32,
|
||||
/// Sum of `completion_tokens` across every generator call.
|
||||
pub completion_tokens: u32,
|
||||
/// Total number of generator calls. `1 + empty_retries +
|
||||
/// continuations` in the normal case; the field is explicit so
|
||||
/// callers don't have to re-derive it.
|
||||
pub calls: u32,
|
||||
}
|
||||
|
||||
fn make_request(opts: &ContinuableOpts, prompt: String, current_max: u32) -> GenerateRequest {
|
||||
@ -186,20 +175,11 @@ pub async fn generate_continuable<G: TextGenerator>(
|
||||
let mut combined = String::new();
|
||||
let mut empty_retries = 0usize;
|
||||
let mut continuations = 0usize;
|
||||
let mut prompt_tokens: u32 = 0;
|
||||
let mut completion_tokens: u32 = 0;
|
||||
let mut calls: u32 = 0;
|
||||
|
||||
// Phase 21(a) — empty-response backoff loop.
|
||||
for retry in 0..opts.max_empty_retries {
|
||||
let req = make_request(opts, prompt.to_string(), current_max);
|
||||
let resp = generator.generate_text(req).await?;
|
||||
calls += 1;
|
||||
// u32::try_from saturates at u32::MAX instead of silently
|
||||
// truncating bits when tokens_evaluated/_generated comes back
|
||||
// as a u64 > 4 billion. Caught 2026-04-27 by Opus self-audit.
|
||||
prompt_tokens = prompt_tokens.saturating_add(u32::try_from(resp.tokens_evaluated.unwrap_or(0)).unwrap_or(u32::MAX));
|
||||
completion_tokens = completion_tokens.saturating_add(u32::try_from(resp.tokens_generated.unwrap_or(0)).unwrap_or(u32::MAX));
|
||||
if !resp.text.trim().is_empty() {
|
||||
combined = resp.text;
|
||||
break;
|
||||
@ -208,7 +188,9 @@ pub async fn generate_continuable<G: TextGenerator>(
|
||||
current_max = (current_max.saturating_mul(2)).min(opts.budget_cap);
|
||||
}
|
||||
|
||||
// Phase 21(b) — structural-completion continuation loop.
|
||||
// Phase 21(b) — structural-completion continuation loop. Runs on
|
||||
// the truncated-non-empty case; empty + exhausted retries falls
|
||||
// through with empty combined and final_complete=false.
|
||||
for _ in 0..opts.max_continuations {
|
||||
if is_structurally_complete(&combined, opts.shape) {
|
||||
return Ok(ContinuableOutcome {
|
||||
@ -216,22 +198,17 @@ pub async fn generate_continuable<G: TextGenerator>(
|
||||
empty_retries,
|
||||
continuations,
|
||||
final_complete: true,
|
||||
prompt_tokens,
|
||||
completion_tokens,
|
||||
calls,
|
||||
});
|
||||
}
|
||||
if combined.trim().is_empty() {
|
||||
// Nothing to continue from — continuing "" is identical to
|
||||
// the initial call and would loop.
|
||||
// the initial call and would loop. Bail so the caller sees
|
||||
// the failure rather than burning N extra calls.
|
||||
break;
|
||||
}
|
||||
let cont_prompt = continuation_prompt(prompt, &combined);
|
||||
let req = make_request(opts, cont_prompt, current_max.min(opts.budget_cap));
|
||||
let resp = generator.generate_text(req).await?;
|
||||
calls += 1;
|
||||
prompt_tokens = prompt_tokens.saturating_add(u32::try_from(resp.tokens_evaluated.unwrap_or(0)).unwrap_or(u32::MAX));
|
||||
completion_tokens = completion_tokens.saturating_add(u32::try_from(resp.tokens_generated.unwrap_or(0)).unwrap_or(u32::MAX));
|
||||
combined.push_str(&resp.text);
|
||||
continuations += 1;
|
||||
}
|
||||
@ -242,9 +219,6 @@ pub async fn generate_continuable<G: TextGenerator>(
|
||||
empty_retries,
|
||||
continuations,
|
||||
final_complete,
|
||||
prompt_tokens,
|
||||
completion_tokens,
|
||||
calls,
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
@ -1,8 +1,5 @@
|
||||
pub mod client;
|
||||
pub mod context;
|
||||
pub mod continuation;
|
||||
pub mod provider;
|
||||
pub mod providers;
|
||||
pub mod routing;
|
||||
pub mod service;
|
||||
pub mod tree_split;
|
||||
|
||||
@ -1,39 +0,0 @@
|
||||
use async_trait::async_trait;
|
||||
use crate::client::{GenerateRequest, GenerateResponse, EmbedRequest, EmbedResponse};
|
||||
|
||||
#[async_trait]
|
||||
pub trait ProviderAdapter: Send + Sync {
|
||||
/// Name for routing (ollama, openrouter, etc.)
|
||||
fn name(&self) -> &str;
|
||||
|
||||
/// Chat completion — returns text, model, token counts
|
||||
async fn chat(&self, req: GenerateRequest) -> Result<GenerateResponse, String>;
|
||||
|
||||
/// Embeddings — returns vectors, model, dimensions
|
||||
async fn embed(&self, req: EmbedRequest) -> Result<EmbedResponse, String>;
|
||||
|
||||
/// Unload model from VRAM (optional, no-op if not supported)
|
||||
async fn unload(&self, _model: &str) -> Result<(), String> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Health check
|
||||
async fn health(&self) -> Result<serde_json::Value, String>;
|
||||
}
|
||||
|
||||
/// Routing key extracted from model name.
|
||||
/// - "qwen3.5:latest" → "ollama"
|
||||
/// - "openrouter/anthropic/claude-3.5-sonnet" → "openrouter"
|
||||
/// - "gpt-4o" → "ollama" (default)
|
||||
pub fn provider_key(model: &str) -> &'static str {
|
||||
let lower = model.to_lowercase();
|
||||
if lower.starts_with("openrouter/") {
|
||||
"openrouter"
|
||||
} else if lower.starts_with("gemini") {
|
||||
"gemini"
|
||||
} else if lower.starts_with("claude") {
|
||||
"claude"
|
||||
} else {
|
||||
"ollama" // default: local Ollama
|
||||
}
|
||||
}
|
||||
@ -1,2 +0,0 @@
|
||||
pub mod ollama;
|
||||
pub mod openrouter;
|
||||
@ -1,37 +0,0 @@
|
||||
use async_trait::async_trait;
|
||||
use crate::client::{AiClient, GenerateRequest, GenerateResponse, EmbedRequest, EmbedResponse};
|
||||
use crate::provider::ProviderAdapter;
|
||||
|
||||
pub struct OllamaAdapter {
|
||||
client: AiClient,
|
||||
}
|
||||
|
||||
impl OllamaAdapter {
|
||||
pub fn new(client: AiClient) -> Self {
|
||||
Self { client }
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl ProviderAdapter for OllamaAdapter {
|
||||
fn name(&self) -> &str {
|
||||
"ollama"
|
||||
}
|
||||
|
||||
async fn chat(&self, req: GenerateRequest) -> Result<GenerateResponse, String> {
|
||||
self.client.generate(req).await
|
||||
}
|
||||
|
||||
async fn embed(&self, req: EmbedRequest) -> Result<EmbedResponse, String> {
|
||||
self.client.embed(req).await
|
||||
}
|
||||
|
||||
async fn unload(&self, model: &str) -> Result<(), String> {
|
||||
let _: serde_json::Value = self.client.unload_model(model).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn health(&self) -> Result<serde_json::Value, String> {
|
||||
self.client.health().await
|
||||
}
|
||||
}
|
||||
@ -1,152 +0,0 @@
|
||||
use async_trait::async_trait;
|
||||
use reqwest::Client;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::time::Duration;
|
||||
|
||||
use crate::client::{GenerateRequest, GenerateResponse, EmbedRequest, EmbedResponse};
|
||||
use crate::provider::ProviderAdapter;
|
||||
|
||||
pub struct OpenRouterAdapter {
|
||||
client: Client,
|
||||
base_url: String,
|
||||
api_key: String,
|
||||
default_model: String,
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct OpenRouterChatRequest {
|
||||
model: String,
|
||||
messages: Vec<OpenRouterMessage>,
|
||||
temperature: Option<f64>,
|
||||
max_tokens: Option<u32>,
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct OpenRouterMessage {
|
||||
role: String,
|
||||
content: String,
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct OpenRouterChatResponse {
|
||||
choices: Vec<OpenRouterChoice>,
|
||||
usage: OpenRouterUsage,
|
||||
model: String,
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct OpenRouterChoice {
|
||||
message: OpenRouterMessageOut,
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
#[allow(dead_code)]
|
||||
struct OpenRouterMessageOut {
|
||||
role: String,
|
||||
content: String,
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
#[allow(dead_code)]
|
||||
struct OpenRouterUsage {
|
||||
prompt_tokens: Option<u32>,
|
||||
completion_tokens: Option<u32>,
|
||||
total_tokens: Option<u32>,
|
||||
}
|
||||
|
||||
impl OpenRouterAdapter {
|
||||
pub fn new(base_url: &str, api_key: String, default_model: &str) -> Self {
|
||||
let client = Client::builder()
|
||||
.timeout(Duration::from_secs(180))
|
||||
.build()
|
||||
.expect("failed to build HTTP client");
|
||||
Self {
|
||||
client,
|
||||
base_url: base_url.trim_end_matches('/').to_string(),
|
||||
api_key,
|
||||
default_model: default_model.to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
fn chat_model(&self, model: &str) -> String {
|
||||
// Strip "openrouter/" prefix if present
|
||||
let m = model.trim_start_matches("openrouter/");
|
||||
if m.is_empty() || m == model {
|
||||
self.default_model.clone()
|
||||
} else {
|
||||
m.to_string()
|
||||
}
|
||||
}
|
||||
|
||||
fn to_openrouter_messages(req: &GenerateRequest) -> Vec<OpenRouterMessage> {
|
||||
let mut out = vec![];
|
||||
if let Some(sys) = &req.system {
|
||||
out.push(OpenRouterMessage { role: "system".into(), content: sys.clone() });
|
||||
}
|
||||
out.push(OpenRouterMessage {
|
||||
role: "user".into(),
|
||||
content: req.prompt.clone(),
|
||||
});
|
||||
out
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl ProviderAdapter for OpenRouterAdapter {
|
||||
fn name(&self) -> &str {
|
||||
"openrouter"
|
||||
}
|
||||
|
||||
async fn chat(&self, req: GenerateRequest) -> Result<GenerateResponse, String> {
|
||||
let model = self.chat_model(req.model.as_deref().unwrap_or(""));
|
||||
let or_req = OpenRouterChatRequest {
|
||||
model: model.clone(),
|
||||
messages: OpenRouterAdapter::to_openrouter_messages(&req),
|
||||
temperature: req.temperature,
|
||||
max_tokens: req.max_tokens,
|
||||
};
|
||||
|
||||
let resp = self.client
|
||||
.post(format!("{}/chat/completions", self.base_url))
|
||||
.header("Authorization", format!("Bearer {}", self.api_key))
|
||||
.header("Content-Type", "application/json")
|
||||
.json(&or_req)
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| format!("openrouter request failed: {e}"))?;
|
||||
|
||||
let status = resp.status();
|
||||
let body = resp.text().await.unwrap_or_default();
|
||||
|
||||
if !status.is_success() {
|
||||
return Err(format!("openrouter error ({}): {}", status, body));
|
||||
}
|
||||
|
||||
let or_resp: OpenRouterChatResponse = serde_json::from_str(&body)
|
||||
.map_err(|e| format!("openrouter parse error: {e}"))?;
|
||||
|
||||
let choice = or_resp.choices.into_iter().next()
|
||||
.ok_or("no completion choice returned")?;
|
||||
let usage = or_resp.usage;
|
||||
|
||||
Ok(GenerateResponse {
|
||||
text: choice.message.content,
|
||||
model: or_resp.model,
|
||||
tokens_evaluated: usage.prompt_tokens.map(|n| n as u64),
|
||||
tokens_generated: usage.completion_tokens.map(|n| n as u64),
|
||||
})
|
||||
}
|
||||
|
||||
async fn embed(&self, _req: EmbedRequest) -> Result<EmbedResponse, String> {
|
||||
Err("openrouter: embed not implemented".into())
|
||||
}
|
||||
|
||||
async fn health(&self) -> Result<serde_json::Value, String> {
|
||||
// OpenRouter doesn't have a dedicated health endpoint,
|
||||
// so we just return a healthy response if the client works.
|
||||
Ok(serde_json::json!({
|
||||
"status": "ok",
|
||||
"provider": "openrouter",
|
||||
}))
|
||||
}
|
||||
}
|
||||
@ -1,119 +0,0 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
#[derive(Clone, Debug, Deserialize, Serialize)]
|
||||
pub struct RoutingRule {
|
||||
pub model_pattern: String,
|
||||
pub provider: String,
|
||||
pub max_tokens: Option<u32>,
|
||||
pub temperature: Option<f64>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Default)]
|
||||
pub struct RoutingEngine {
|
||||
rules: Vec<RoutingRule>,
|
||||
fallback_chain: Vec<String>,
|
||||
}
|
||||
|
||||
impl RoutingEngine {
|
||||
pub fn new() -> Self {
|
||||
Self::default()
|
||||
}
|
||||
|
||||
pub fn with_rules(mut self, rules: Vec<RoutingRule>) -> Self {
|
||||
self.rules = rules;
|
||||
self
|
||||
}
|
||||
|
||||
pub fn with_fallback(mut self, chain: Vec<String>) -> Self {
|
||||
self.fallback_chain = chain;
|
||||
self
|
||||
}
|
||||
|
||||
pub fn route(&self, model: &str) -> RouteDecision {
|
||||
let lower = model.to_lowercase();
|
||||
|
||||
for rule in &self.rules {
|
||||
if glob_match(&rule.model_pattern.to_lowercase(), &lower) {
|
||||
return RouteDecision {
|
||||
provider: rule.provider.clone(),
|
||||
model: model.to_string(),
|
||||
max_tokens: rule.max_tokens,
|
||||
temperature: rule.temperature,
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(first) = self.fallback_chain.first() {
|
||||
RouteDecision {
|
||||
provider: first.clone(),
|
||||
model: model.to_string(),
|
||||
max_tokens: None,
|
||||
temperature: None,
|
||||
}
|
||||
} else {
|
||||
RouteDecision {
|
||||
provider: "ollama".to_string(),
|
||||
model: model.to_string(),
|
||||
max_tokens: None,
|
||||
temperature: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct RouteDecision {
|
||||
pub provider: String,
|
||||
pub model: String,
|
||||
pub max_tokens: Option<u32>,
|
||||
pub temperature: Option<f64>,
|
||||
}
|
||||
|
||||
fn glob_match(pattern: &str, name: &str) -> bool {
|
||||
if !pattern.contains('*') { return pattern == name; }
|
||||
let parts: Vec<&str> = pattern.split('*').collect();
|
||||
// Multi-* support: first must be prefix, last must be suffix, each
|
||||
// interior piece must appear in order. Fixes the iter-9 finding
|
||||
// where gpt-*-large* silently fell through to an exact-match path.
|
||||
// Also removes the dead `parts.len() == 1` branch that accessed
|
||||
// parts[1] and would panic if ever reached (unreachable today
|
||||
// since split('*') on a string containing '*' always yields ≥2).
|
||||
if !name.starts_with(parts[0]) || !name.ends_with(parts.last().unwrap()) { return false; }
|
||||
let mut cursor = parts[0].len();
|
||||
parts[1..parts.len() - 1].iter().all(|mid| {
|
||||
name[cursor..].find(mid).map(|pos| { cursor += pos + mid.len(); true }).unwrap_or(false)
|
||||
})
|
||||
}
|
||||
|
||||
impl Default for RoutingRule {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
model_pattern: "*".to_string(),
|
||||
provider: "ollama".to_string(),
|
||||
max_tokens: None,
|
||||
temperature: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod glob_match_tests {
    //! Unit tests for the `*`-wildcard matcher used by routing rules.
    use super::glob_match;

    // Literal patterns: exact string equality only.
    #[test] fn exact_match() { assert!(glob_match("gpt-oss:120b", "gpt-oss:120b")); }
    #[test] fn exact_mismatch() { assert!(!glob_match("a", "b")); }
    // Single-wildcard positions: suffix anchor, prefix anchor, match-all.
    #[test] fn leading_wildcard() { assert!(glob_match("*:120b", "gpt-oss:120b")); }
    #[test] fn trailing_wildcard() { assert!(glob_match("gpt-oss:*", "gpt-oss:120b")); }
    #[test] fn bare_wildcard() { assert!(glob_match("*", "anything")); }
    // Interior pieces must appear in pattern order, not just anywhere.
    #[test] fn multi_wildcard_in_order() { assert!(glob_match("gpt-*-oss-*", "gpt-4-oss-120b")); }
    #[test] fn multi_wildcard_wrong_order() { assert!(!glob_match("b*a*", "abba")); }
    #[test] fn multi_wildcard_panic_safety() {
        // Regression: earlier impl had an unreachable `parts.len() == 1`
        // branch that indexed parts[1] — would panic if ever hit. Now
        // the split('*') invariant guarantees ≥2 parts when * present,
        // and we handle all N-part cases explicitly.
        assert!(glob_match("a*b*c", "abc"));
        assert!(glob_match("a*b*c", "axxxbxxxc"));
        assert!(!glob_match("a*b*c", "xxxbxxx"));
    }
}
|
||||
@ -19,9 +19,8 @@
|
||||
//! we bubble the error up rather than silently truncating. That's the
|
||||
//! whole point of Phase 21.
|
||||
|
||||
use crate::context::{assert_context_budget, BudgetOpts, overflow_message,
|
||||
use crate::context::{assert_context_budget, BudgetOpts, estimate_tokens, overflow_message,
|
||||
DEFAULT_MAX_TOKENS, DEFAULT_SAFETY_MARGIN};
|
||||
use shared::model_matrix::ModelMatrix;
|
||||
use crate::continuation::{generate_continuable, ContinuableOpts, ResponseShape, TextGenerator};
|
||||
|
||||
/// Callback signatures — caller supplies closures that stitch the
|
||||
@ -81,12 +80,12 @@ pub struct TreeSplitResult {
|
||||
/// by `\n— shard N/M digest —\n` so we can find the first one and
|
||||
/// chop everything before its successor.
|
||||
fn truncate_scratchpad(scratchpad: &mut String, budget_tokens: usize) -> bool {
|
||||
if ModelMatrix::estimate_tokens(scratchpad) <= budget_tokens { return false; }
|
||||
if estimate_tokens(scratchpad) <= budget_tokens { return false; }
|
||||
// Find the second delimiter — everything before it gets dropped.
|
||||
const DELIM_PREFIX: &str = "\n— shard ";
|
||||
let mut cursor = 0;
|
||||
let mut truncated = false;
|
||||
while ModelMatrix::estimate_tokens(&scratchpad[cursor..]) > budget_tokens {
|
||||
while estimate_tokens(&scratchpad[cursor..]) > budget_tokens {
|
||||
// Skip past a leading delimiter (if we're sitting on one from
|
||||
// a previous iteration), then find the next.
|
||||
let search_from = cursor + if scratchpad[cursor..].starts_with(DELIM_PREFIX) {
|
||||
@ -279,7 +278,7 @@ mod tests {
|
||||
// Scratchpad should still fit roughly within the budget
|
||||
// (post-truncation); the estimator uses chars/4 so the bound
|
||||
// is ~budget*4 chars. Give some slack for the delimiter.
|
||||
let scratchpad_tokens = ModelMatrix::estimate_tokens(&result.scratchpad);
|
||||
let scratchpad_tokens = estimate_tokens(&result.scratchpad);
|
||||
assert!(scratchpad_tokens <= opts.scratchpad_budget * 2,
|
||||
"scratchpad {} tokens vs budget {}", scratchpad_tokens, opts.scratchpad_budget);
|
||||
}
|
||||
|
||||
@ -31,10 +31,6 @@ pub fn router(registry: Registry) -> Router {
|
||||
// Phase 17: model profiles
|
||||
.route("/profiles", post(create_profile).get(list_profiles))
|
||||
.route("/profiles/{id}", get(get_profile).delete(delete_profile))
|
||||
// Phase 41: profile-type routes
|
||||
.route("/profiles/retrieval", get(list_retrieval_profiles))
|
||||
.route("/profiles/memory", get(list_memory_profiles))
|
||||
.route("/profiles/observer", get(list_observer_profiles))
|
||||
.with_state(registry)
|
||||
}
|
||||
|
||||
@ -402,9 +398,6 @@ struct CreateProfileRequest {
|
||||
/// indexes to. Defaults to Parquet+HNSW.
|
||||
#[serde(default)]
|
||||
vector_backend: shared::types::VectorBackend,
|
||||
/// Phase 41: Profile type for routing + activation behavior.
|
||||
#[serde(default)]
|
||||
profile_type: shared::types::ProfileType,
|
||||
}
|
||||
|
||||
fn default_embed_model_req() -> String { "nomic-embed-text".to_string() }
|
||||
@ -424,7 +417,6 @@ async fn create_profile(
|
||||
created_by: req.created_by,
|
||||
bucket: req.bucket,
|
||||
vector_backend: req.vector_backend,
|
||||
profile_type: req.profile_type,
|
||||
};
|
||||
match registry.put_profile(profile).await {
|
||||
Ok(p) => Ok((StatusCode::CREATED, Json(p))),
|
||||
@ -436,24 +428,6 @@ async fn list_profiles(State(registry): State<Registry>) -> impl IntoResponse {
|
||||
Json(registry.list_profiles().await)
|
||||
}
|
||||
|
||||
async fn list_retrieval_profiles(State(registry): State<Registry>) -> impl IntoResponse {
|
||||
let all = registry.list_profiles().await;
|
||||
let retrieval: Vec<_> = all.into_iter().filter(|p| p.profile_type == shared::types::ProfileType::Retrieval).collect();
|
||||
Json(retrieval)
|
||||
}
|
||||
|
||||
async fn list_memory_profiles(State(registry): State<Registry>) -> impl IntoResponse {
|
||||
let all = registry.list_profiles().await;
|
||||
let memory: Vec<_> = all.into_iter().filter(|p| p.profile_type == shared::types::ProfileType::Memory).collect();
|
||||
Json(memory)
|
||||
}
|
||||
|
||||
async fn list_observer_profiles(State(registry): State<Registry>) -> impl IntoResponse {
|
||||
let all = registry.list_profiles().await;
|
||||
let observer: Vec<_> = all.into_iter().filter(|p| p.profile_type == shared::types::ProfileType::Observer).collect();
|
||||
Json(observer)
|
||||
}
|
||||
|
||||
async fn get_profile(
|
||||
State(registry): State<Registry>,
|
||||
Path(id): Path<String>,
|
||||
|
||||
@ -12,8 +12,6 @@ aibridge = { path = "../aibridge" }
|
||||
ingestd = { path = "../ingestd" }
|
||||
vectord = { path = "../vectord" }
|
||||
journald = { path = "../journald" }
|
||||
truth = { path = "../truth" }
|
||||
validator = { path = "../validator" }
|
||||
tokio = { workspace = true }
|
||||
axum = { workspace = true }
|
||||
serde = { workspace = true }
|
||||
@ -31,4 +29,3 @@ tracing-opentelemetry = { workspace = true }
|
||||
arrow = { workspace = true }
|
||||
chrono = { workspace = true }
|
||||
reqwest = { version = "0.12", default-features = false, features = ["json", "rustls-tls"] }
|
||||
toml = { workspace = true }
|
||||
|
||||
@ -93,7 +93,7 @@ impl AccessControl {
|
||||
self.roles.write().await.insert(role.agent_name.clone(), role);
|
||||
}
|
||||
|
||||
/// Get an agent's role. Called by `GET /access/roles/{agent}`.
|
||||
/// Get an agent's role.
|
||||
pub async fn get_role(&self, agent: &str) -> Option<AgentRole> {
|
||||
self.roles.read().await.get(agent).cloned()
|
||||
}
|
||||
@ -113,7 +113,6 @@ impl AccessControl {
|
||||
}
|
||||
|
||||
/// Determine which fields should be masked for an agent.
|
||||
#[allow(dead_code)]
|
||||
pub async fn masked_fields(
|
||||
&self,
|
||||
agent: &str,
|
||||
@ -139,7 +138,6 @@ impl AccessControl {
|
||||
}
|
||||
|
||||
/// Log a query for audit.
|
||||
#[allow(dead_code)]
|
||||
pub async fn log_query(&self, audit: QueryAudit) {
|
||||
self.audit_log.write().await.push(audit);
|
||||
}
|
||||
@ -151,9 +149,6 @@ impl AccessControl {
|
||||
log[start..].iter().rev().cloned().collect()
|
||||
}
|
||||
|
||||
/// Reports whether access-control enforcement is active.
|
||||
/// Called by `GET /access/enabled` — ops tooling / dashboards poll
|
||||
/// this to confirm the auth posture of the running gateway.
|
||||
pub fn is_enabled(&self) -> bool {
|
||||
self.enabled
|
||||
}
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
use axum::{
|
||||
Json, Router,
|
||||
extract::{Path, Query, State},
|
||||
extract::{Query, State},
|
||||
http::StatusCode,
|
||||
response::IntoResponse,
|
||||
routing::{get, post},
|
||||
@ -13,12 +13,6 @@ pub fn router(ac: AccessControl) -> Router {
|
||||
Router::new()
|
||||
.route("/roles", get(list_roles))
|
||||
.route("/roles", post(set_role))
|
||||
// Scrum iter 11 / P13-001 finding: get_role was #[allow(dead_code)]
|
||||
// because nothing called it — dead until exposed. Route activates it.
|
||||
// Returns 404 when the agent isn't registered so clients can
|
||||
// distinguish "missing role" from "access denied."
|
||||
.route("/roles/{agent}", get(get_role))
|
||||
.route("/enabled", get(enabled_status))
|
||||
.route("/audit", get(query_audit))
|
||||
.route("/check", post(check_access))
|
||||
.with_state(ac)
|
||||
@ -66,17 +60,3 @@ async fn check_access(
|
||||
"allowed": allowed,
|
||||
}))
|
||||
}
|
||||
|
||||
/// `GET /access/roles/{agent}` — look up one agent's registered role.
/// 200 + role JSON when registered; 404 with a message otherwise, so
/// clients can distinguish "missing role" from "access denied".
async fn get_role(
    State(ac): State<AccessControl>,
    Path(agent): Path<String>,
) -> impl IntoResponse {
    match ac.get_role(&agent).await {
        Some(role) => Ok(Json(role)),
        None => Err((StatusCode::NOT_FOUND, format!("no role registered for agent '{agent}'"))),
    }
}
|
||||
|
||||
async fn enabled_status(State(ac): State<AccessControl>) -> impl IntoResponse {
|
||||
Json(serde_json::json!({ "enabled": ac.is_enabled() }))
|
||||
}
|
||||
|
||||
@ -5,51 +5,30 @@ use axum::{
|
||||
response::Response,
|
||||
};
|
||||
|
||||
// API key auth middleware. Checks X-API-Key header against configured key.
|
||||
// Fixed P5-001 (2026-04-23): previously #[allow(dead_code)] — the function
|
||||
// existed but was never layered onto the router, so [auth] enabled=true
|
||||
// silently enforced nothing. Now wired via from_fn_with_state in main.rs.
|
||||
/// API key auth middleware. Checks X-API-Key header against configured key.
|
||||
pub async fn api_key_auth(
|
||||
axum::extract::State(expected): axum::extract::State<ApiKey>,
|
||||
request: Request,
|
||||
next: Next,
|
||||
) -> Result<Response, StatusCode> {
|
||||
// /health stays public (LB/systemd probes). Every other route is gated.
|
||||
if request.uri().path() == "/health" {
|
||||
return Ok(next.run(request).await);
|
||||
}
|
||||
// Get the expected key from the request extensions (set by the layer)
|
||||
let expected_key = request.extensions().get::<ApiKey>().cloned();
|
||||
|
||||
let provided = request
|
||||
.headers()
|
||||
.get("x-api-key")
|
||||
.and_then(|v| v.to_str().ok());
|
||||
if let Some(expected) = expected_key {
|
||||
let provided = request
|
||||
.headers()
|
||||
.get("x-api-key")
|
||||
.and_then(|v| v.to_str().ok());
|
||||
|
||||
// Constant-time-ish eq on the raw bytes; good enough for a shared-secret
|
||||
// X-API-Key. Timing-attack resistance here matters less than the
|
||||
// equivalent HMAC check would; adopt subtle crate if key-space grows.
|
||||
match provided {
|
||||
Some(key) if eq_ct(key.as_bytes(), expected.0.as_bytes()) => {
|
||||
Ok(next.run(request).await)
|
||||
}
|
||||
_ => {
|
||||
tracing::warn!(
|
||||
path = %request.uri().path(),
|
||||
"unauthorized request: missing or invalid API key",
|
||||
);
|
||||
Err(StatusCode::UNAUTHORIZED)
|
||||
match provided {
|
||||
Some(key) if key == expected.0 => {}
|
||||
_ => {
|
||||
tracing::warn!("unauthorized request: missing or invalid API key");
|
||||
return Err(StatusCode::UNAUTHORIZED);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn eq_ct(a: &[u8], b: &[u8]) -> bool {
|
||||
if a.len() != b.len() {
|
||||
return false;
|
||||
}
|
||||
let mut diff: u8 = 0;
|
||||
for (x, y) in a.iter().zip(b.iter()) {
|
||||
diff |= x ^ y;
|
||||
}
|
||||
diff == 0
|
||||
Ok(next.run(request).await)
|
||||
}
|
||||
|
||||
/// Wrapper type for the API key, stored in request extensions.
|
||||
|
||||
@ -1,37 +0,0 @@
|
||||
//! Cross-runtime parity helper for `extract_json`.
|
||||
//!
|
||||
//! Reads a single model-output string from stdin, runs the Rust
|
||||
//! extract_json, prints `{"matched": bool, "value": <object|null>}`
|
||||
//! to stdout as JSON. Exit 0 on success, exit 1 on internal error.
|
||||
//!
|
||||
//! The Go counterpart lives at
|
||||
//! `golangLAKEHOUSE/internal/validator/iterate.go::ExtractJSON`. The
|
||||
//! parity probe at
|
||||
//! `golangLAKEHOUSE/scripts/cutover/parity/extract_json_parity.sh`
|
||||
//! feeds the same fixtures through both and diffs the outputs.
|
||||
//!
|
||||
//! Usage:
|
||||
//! echo '<raw model output>' | parity_extract_json
|
||||
//! parity_extract_json <<< '...'
|
||||
|
||||
use std::io::Read;
|
||||
|
||||
fn main() {
|
||||
let mut buf = String::new();
|
||||
if let Err(e) = std::io::stdin().read_to_string(&mut buf) {
|
||||
eprintln!("read stdin: {e}");
|
||||
std::process::exit(1);
|
||||
}
|
||||
let result = gateway::v1::iterate::extract_json(&buf);
|
||||
let body = serde_json::json!({
|
||||
"matched": result.is_some(),
|
||||
"value": result.unwrap_or(serde_json::Value::Null),
|
||||
});
|
||||
match serde_json::to_string(&body) {
|
||||
Ok(s) => println!("{s}"),
|
||||
Err(e) => {
|
||||
eprintln!("serialize result: {e}");
|
||||
std::process::exit(1);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1,71 +0,0 @@
|
||||
//! Cross-runtime parity helper for `SessionRecord` JSON shape.
|
||||
//!
|
||||
//! Reads a fixture JSON on stdin, builds a `SessionRecord`, emits
|
||||
//! one JSONL row on stdout. Used by
|
||||
//! `golangLAKEHOUSE/scripts/cutover/parity/session_log_parity.sh`
|
||||
//! to verify the Rust gateway's session log shape stays byte-equal
|
||||
//! to the Go-side validatord's `validator.SessionRecord` (commit
|
||||
//! 1a3a82a in golangLAKEHOUSE).
|
||||
|
||||
use gateway::v1::session_log::{SessionAttemptRecord, SessionRecord, SESSION_RECORD_SCHEMA};
|
||||
use serde::Deserialize;
|
||||
use std::io::Read;
|
||||
|
||||
/// Fixture shape fed on stdin by the parity shell script. Mirrors the
/// per-fixture fields of `SessionRecord`; `schema`, `timestamp`, and
/// `daemon` are pinned constants in `main` and therefore absent here.
#[derive(Deserialize)]
struct FixtureInput {
    session_id: String,
    kind: String,
    model: String,
    provider: String,
    prompt: String,
    iterations: u32,
    max_iterations: u32,
    final_verdict: String,
    attempts: Vec<SessionAttemptRecord>,
    // Optional in fixtures — defaults to None when omitted.
    #[serde(default)]
    artifact: Option<serde_json::Value>,
    // Optional in fixtures — defaults to None when omitted.
    #[serde(default)]
    grounded_in_roster: Option<bool>,
    duration_ms: u64,
}
|
||||
|
||||
/// Parity driver: stdin fixture JSON → `SessionRecord` → one JSONL row on
/// stdout. Exits 1 with a message on stderr if stdin can't be read, the
/// fixture doesn't parse, or the record can't be serialized.
fn main() {
    let mut buf = String::new();
    if let Err(e) = std::io::stdin().read_to_string(&mut buf) {
        eprintln!("read stdin: {e}");
        std::process::exit(1);
    }
    let input: FixtureInput = match serde_json::from_str(&buf) {
        Ok(v) => v,
        Err(e) => {
            eprintln!("parse stdin: {e}");
            std::process::exit(1);
        }
    };
    // Assemble the record: fixture fields pass through; fields not in the
    // fixture are pinned constants so both runtimes emit comparable rows.
    let rec = SessionRecord {
        schema: SESSION_RECORD_SCHEMA.to_string(),
        session_id: input.session_id,
        // Pinned timestamp so both runtimes' rows compare byte-equal
        // when the test wrapper normalizes on `daemon` only.
        timestamp: "2026-01-01T00:00:00+00:00".to_string(),
        daemon: "gateway".to_string(),
        kind: input.kind,
        model: input.model,
        provider: input.provider,
        prompt: input.prompt,
        iterations: input.iterations,
        max_iterations: input.max_iterations,
        final_verdict: input.final_verdict,
        attempts: input.attempts,
        artifact: input.artifact,
        grounded_in_roster: input.grounded_in_roster,
        duration_ms: input.duration_ms,
    };
    match serde_json::to_string(&rec) {
        Ok(s) => println!("{s}"),
        Err(e) => {
            eprintln!("marshal: {e}");
            std::process::exit(1);
        }
    }
}
|
||||
@ -1,388 +0,0 @@
|
||||
//! KB context loader — reads recent signal from `data/_kb/*.jsonl` for
|
||||
//! a given sig_hash + task_class and returns a compact summary.
|
||||
//!
|
||||
//! This is the "pipe to the overviewer" from the 2026-04-23 session:
|
||||
//! the overseer tier (T3, gpt-oss:120b) consumes this context before
|
||||
//! generating a correction, so its suggestions are informed by
|
||||
//! historical cost / latency / outcome / prior-correction patterns
|
||||
//! across ALL profiles that have run this task class — not just the
|
||||
//! single current loop.
|
||||
//!
|
||||
//! Hot-swap profiles read the SAME pool. When a profile activates and
|
||||
//! starts iterating, its KB context is the shared surface — one
|
||||
//! profile's learning becomes every profile's starting point.
|
||||
//!
|
||||
//! Best-effort throughout: missing files, corrupt rows, empty
|
||||
//! directories all produce an empty KbContext. The overseer works
|
||||
//! fine with no history; we just can't seed it then.
|
||||
|
||||
use serde::Serialize;
|
||||
use std::path::Path;
|
||||
use tokio::io::AsyncBufReadExt;
|
||||
|
||||
/// Compact summary returned to the overseer. Bounded size — recent
|
||||
/// outcomes + corrections plus rolled-up rates. Goal is to fit in a
|
||||
/// prompt without eating the overseer's context budget.
|
||||
#[derive(Debug, Clone, Default, Serialize)]
pub struct KbContext {
    /// Signature hash this context was loaded for (exact-match key).
    pub sig_hash: String,
    /// Task class used as the loose-match fallback key.
    pub task_class: String,
    /// Up to RECENT_OUTCOME_LIMIT newest outcomes, exact sig_hash matches first.
    pub recent_outcomes: Vec<OutcomeSummary>,
    /// Up to RECENT_CORRECTION_LIMIT newest overseer corrections, exact matches first.
    pub recent_corrections: Vec<CorrectionSummary>,
    /// ok-fraction over the aggregate window; None when no history matched.
    pub success_rate: Option<f64>,
    /// Mean turns over the aggregate window; None when no history matched.
    pub avg_turns: Option<f64>,
    /// Mean latency over the window (integer division, floored ms); None when empty.
    pub avg_latency_ms: Option<u64>,
    /// Number of rows contributing to the aggregates (≤ AGGREGATE_WINDOW).
    pub total_observed: u32,
}
|
||||
|
||||
/// One prior run's outcome, projected from a raw outcomes.jsonl row.
/// Missing fields in the row default (empty string / false / 0 / None).
#[derive(Debug, Clone, Serialize)]
pub struct OutcomeSummary {
    /// Row timestamp string, copied verbatim from `created_at`.
    pub created_at: String,
    /// Whether the run succeeded (`ok` field).
    pub ok: bool,
    /// Outcome polarity label, copied verbatim from the row.
    pub polarity: String,
    /// Iteration count for the run.
    pub turns: u32,
    /// From the row's nested `usage.latency_ms`.
    pub latency_ms: u64,
    /// From the row's nested `usage.total_tokens`.
    pub total_tokens: u64,
    /// Error string when the row recorded one; None otherwise.
    pub error: Option<String>,
}
|
||||
|
||||
/// One prior overseer correction, projected from overseer_corrections.jsonl.
#[derive(Debug, Clone, Serialize)]
pub struct CorrectionSummary {
    /// Row timestamp string, copied verbatim from `created_at`.
    pub created_at: String,
    /// Why the overseer intervened, copied verbatim from the row.
    pub reason: String,
    pub correction_preview: String, // first 300 chars
    /// Loop turn at which the correction was applied.
    pub applied_at_turn: u32,
}
|
||||
|
||||
// Default on-disk locations of the shared KB pool (relative to CWD;
// tests bypass these via `load_from`).
const OUTCOMES_PATH: &str = "data/_kb/outcomes.jsonl";
const CORRECTIONS_PATH: &str = "data/_kb/overseer_corrections.jsonl";
// How many recent rows are surfaced verbatim in the prompt section.
const RECENT_OUTCOME_LIMIT: usize = 5;
const RECENT_CORRECTION_LIMIT: usize = 3;
// Window size for the rolled-up success/turn/latency aggregates.
const AGGREGATE_WINDOW: usize = 50;
|
||||
|
||||
impl KbContext {
    /// Build context from the default KB paths.
    pub async fn load_for(sig_hash: &str, task_class: &str) -> Self {
        Self::load_from(
            sig_hash, task_class,
            Path::new(OUTCOMES_PATH), Path::new(CORRECTIONS_PATH),
        ).await
    }

    /// Path-taking variant — tests inject tmp files without touching
    /// the real KB directory (same pattern as append_outcomes_row_at).
    ///
    /// Best-effort: missing/corrupt files produce an empty context via
    /// `tail_matching`; this function itself never fails.
    pub async fn load_from(
        sig_hash: &str,
        task_class: &str,
        outcomes_path: &Path,
        corrections_path: &Path,
    ) -> Self {
        let mut ctx = KbContext {
            sig_hash: sig_hash.to_string(),
            task_class: task_class.to_string(),
            ..Default::default()
        };

        // Scan outcomes — matches on sig_hash primary, task_class
        // secondary (so different geos for the same task_class still
        // contribute to aggregate rates even though they won't make
        // the top-5 recent). The bounded window keeps scan cost
        // linear in file size — we're reading tail only.
        let outcome_rows = tail_matching(
            outcomes_path, AGGREGATE_WINDOW * 4,
            |row| {
                let row_sig = row.get("sig_hash").and_then(|v| v.as_str()).unwrap_or("");
                let row_tc = row.get("task_class").and_then(|v| v.as_str()).unwrap_or("");
                row_sig == sig_hash || row_tc == task_class
            },
        ).await;

        // Recent outcomes: exact sig_hash match first (strongest
        // signal), then task_class fallback up to the limit.
        // Rows arrive oldest-first; the `.rev()` calls below put
        // newest-first into the context.
        let mut exact: Vec<OutcomeSummary> = Vec::new();
        let mut loose: Vec<OutcomeSummary> = Vec::new();
        for row in &outcome_rows {
            let row_sig = row.get("sig_hash").and_then(|v| v.as_str()).unwrap_or("");
            let summary = summarize_outcome(row);
            if row_sig == sig_hash { exact.push(summary); }
            else { loose.push(summary); }
        }
        ctx.recent_outcomes = exact.into_iter().rev().take(RECENT_OUTCOME_LIMIT).collect();
        if ctx.recent_outcomes.len() < RECENT_OUTCOME_LIMIT {
            let need = RECENT_OUTCOME_LIMIT - ctx.recent_outcomes.len();
            ctx.recent_outcomes.extend(loose.into_iter().rev().take(need));
        }

        // Aggregate rates across the full matched window (both
        // sig_hash and task_class matches — gives a stable rate even
        // on sparse sig_hash history).
        let window = outcome_rows.iter().rev().take(AGGREGATE_WINDOW);
        let mut ok_count = 0u32;
        let mut total = 0u32;
        let mut turn_sum = 0u32;
        let mut latency_sum = 0u64;
        for row in window {
            total += 1;
            if row.get("ok").and_then(|v| v.as_bool()).unwrap_or(false) { ok_count += 1; }
            turn_sum += row.get("turns").and_then(|v| v.as_u64()).unwrap_or(0) as u32;
            latency_sum += row.get("usage")
                .and_then(|u| u.get("latency_ms"))
                .and_then(|v| v.as_u64()).unwrap_or(0);
        }
        if total > 0 {
            ctx.total_observed = total;
            ctx.success_rate = Some(ok_count as f64 / total as f64);
            ctx.avg_turns = Some(turn_sum as f64 / total as f64);
            // Integer division: average latency is floored to whole ms.
            ctx.avg_latency_ms = Some(latency_sum / total as u64);
        }

        // Overseer corrections. Prefer sig_hash match; fall back to
        // task_class. The overseer reading its OWN prior corrections
        // is the main point — if the last 3 attempts produced
        // corrections X, Y, Z, the new correction should acknowledge
        // those patterns rather than suggest X for the fourth time.
        let correction_rows = tail_matching(
            corrections_path, RECENT_CORRECTION_LIMIT * 4,
            |row| {
                let row_sig = row.get("sig_hash").and_then(|v| v.as_str()).unwrap_or("");
                let row_tc = row.get("task_class").and_then(|v| v.as_str()).unwrap_or("");
                row_sig == sig_hash || row_tc == task_class
            },
        ).await;
        let mut c_exact: Vec<CorrectionSummary> = Vec::new();
        let mut c_loose: Vec<CorrectionSummary> = Vec::new();
        for row in &correction_rows {
            let row_sig = row.get("sig_hash").and_then(|v| v.as_str()).unwrap_or("");
            let summary = summarize_correction(row);
            if row_sig == sig_hash { c_exact.push(summary); }
            else { c_loose.push(summary); }
        }
        ctx.recent_corrections = c_exact.into_iter().rev().take(RECENT_CORRECTION_LIMIT).collect();
        if ctx.recent_corrections.len() < RECENT_CORRECTION_LIMIT {
            let need = RECENT_CORRECTION_LIMIT - ctx.recent_corrections.len();
            ctx.recent_corrections.extend(c_loose.into_iter().rev().take(need));
        }

        ctx
    }

    /// Compact string form for the overseer prompt. Deterministic
    /// ordering + bounded length so prompt caching stays stable
    /// across iterations on the same task.
    pub fn to_prompt_section(&self) -> String {
        let mut s = String::new();
        s.push_str("## Knowledge Base Context\n");
        if let (Some(rate), Some(turns), Some(lat)) = (self.success_rate, self.avg_turns, self.avg_latency_ms) {
            s.push_str(&format!(
                "Across {} prior similar runs: success_rate={:.1}%, avg_turns={:.1}, avg_latency_ms={}\n",
                self.total_observed, rate * 100.0, turns, lat,
            ));
        } else {
            s.push_str("No prior similar runs recorded.\n");
        }

        if !self.recent_outcomes.is_empty() {
            s.push_str(&format!("\nRecent {} outcomes:\n", self.recent_outcomes.len()));
            for o in &self.recent_outcomes {
                let err = o.error.as_deref().map(|e| format!(" — {}", truncate(e, 80))).unwrap_or_default();
                // NOTE(review): `[..19]` is a byte slice — panics if
                // created_at contains multi-byte UTF-8 before index 19.
                // Assumes timestamps are always ASCII; confirm upstream.
                s.push_str(&format!(
                    " [{}] ok={} turns={} tokens={} lat={}ms{}\n",
                    &o.created_at[..19.min(o.created_at.len())],
                    o.ok, o.turns, o.total_tokens, o.latency_ms, err,
                ));
            }
        }

        if !self.recent_corrections.is_empty() {
            s.push_str(&format!("\nRecent {} overseer corrections (yours — don't repeat):\n", self.recent_corrections.len()));
            for c in &self.recent_corrections {
                s.push_str(&format!(
                    " [{}] turn={} reason={} correction={}\n",
                    &c.created_at[..19.min(c.created_at.len())],
                    c.applied_at_turn,
                    truncate(&c.reason, 40),
                    truncate(&c.correction_preview, 200),
                ));
            }
        }

        s
    }
}
|
||||
|
||||
fn summarize_outcome(row: &serde_json::Value) -> OutcomeSummary {
|
||||
OutcomeSummary {
|
||||
created_at: row.get("created_at").and_then(|v| v.as_str()).unwrap_or("").to_string(),
|
||||
ok: row.get("ok").and_then(|v| v.as_bool()).unwrap_or(false),
|
||||
polarity: row.get("polarity").and_then(|v| v.as_str()).unwrap_or("").to_string(),
|
||||
turns: row.get("turns").and_then(|v| v.as_u64()).unwrap_or(0) as u32,
|
||||
latency_ms: row.get("usage").and_then(|u| u.get("latency_ms"))
|
||||
.and_then(|v| v.as_u64()).unwrap_or(0),
|
||||
total_tokens: row.get("usage").and_then(|u| u.get("total_tokens"))
|
||||
.and_then(|v| v.as_u64()).unwrap_or(0),
|
||||
error: row.get("error").and_then(|v| v.as_str()).map(String::from),
|
||||
}
|
||||
}
|
||||
|
||||
fn summarize_correction(row: &serde_json::Value) -> CorrectionSummary {
|
||||
let preview = row.get("correction").and_then(|v| v.as_str()).unwrap_or("");
|
||||
CorrectionSummary {
|
||||
created_at: row.get("created_at").and_then(|v| v.as_str()).unwrap_or("").to_string(),
|
||||
reason: row.get("reason").and_then(|v| v.as_str()).unwrap_or("").to_string(),
|
||||
correction_preview: truncate(preview, 300),
|
||||
applied_at_turn: row.get("applied_at_turn").and_then(|v| v.as_u64()).unwrap_or(0) as u32,
|
||||
}
|
||||
}
|
||||
|
||||
/// Truncate `s` to at most `n` bytes, appending `…` when anything was cut.
///
/// Snaps the cut downward to a char boundary: the previous version did
/// `&s[..n]`, which panics when byte `n` falls inside a multi-byte UTF-8
/// sequence — and correction/error text here is arbitrary model output,
/// so non-ASCII is entirely possible.
fn truncate(s: &str, n: usize) -> String {
    if s.len() <= n { return s.to_string(); }
    let mut cut = n;
    while cut > 0 && !s.is_char_boundary(cut) { cut -= 1; }
    format!("{}…", &s[..cut])
}
|
||||
|
||||
/// Read a JSONL file from the tail, returning at most `limit` rows
|
||||
/// that match `filter`. Missing file returns empty. Corrupt lines are
|
||||
/// skipped. Limit is honored from the tail — a full-file scan with an
|
||||
/// in-memory ring would be wasteful for large outcomes histories, but
|
||||
/// we cap at reading the whole file and filtering post-hoc for now
|
||||
/// (reverse-seek line iteration is a real engineering task and the
|
||||
/// file is bounded by ingest rate; revisit when it bites).
|
||||
async fn tail_matching<F>(
|
||||
path: &Path,
|
||||
limit: usize,
|
||||
filter: F,
|
||||
) -> Vec<serde_json::Value>
|
||||
where
|
||||
F: Fn(&serde_json::Value) -> bool,
|
||||
{
|
||||
let Ok(file) = tokio::fs::File::open(path).await else { return Vec::new(); };
|
||||
let reader = tokio::io::BufReader::new(file);
|
||||
let mut lines = reader.lines();
|
||||
let mut matches: Vec<serde_json::Value> = Vec::new();
|
||||
while let Ok(Some(line)) = lines.next_line().await {
|
||||
let Ok(v) = serde_json::from_str::<serde_json::Value>(&line) else { continue };
|
||||
if filter(&v) {
|
||||
matches.push(v);
|
||||
if matches.len() > limit {
|
||||
// Keep the most-recent window only — drop from the
|
||||
// front as we go rather than buffering everything.
|
||||
matches.remove(0);
|
||||
}
|
||||
}
|
||||
}
|
||||
matches
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    //! Fixture-driven tests for the KB context loader. All file paths go
    //! through `tmp_path` so the real `data/_kb` directory is never touched.
    use super::*;
    use tokio::io::AsyncWriteExt;

    /// Write `rows` to `path` as JSONL, creating parent dirs and
    /// truncating any existing file so fixtures are deterministic.
    async fn write_fixture(path: &Path, rows: Vec<serde_json::Value>) {
        if let Some(dir) = path.parent() {
            tokio::fs::create_dir_all(dir).await.unwrap();
        }
        let mut f = tokio::fs::OpenOptions::new()
            .create(true).write(true).truncate(true).open(path).await.unwrap();
        for r in rows {
            let mut line = serde_json::to_string(&r).unwrap();
            line.push('\n');
            f.write_all(line.as_bytes()).await.unwrap();
        }
    }

    /// Unique temp path per process + nanosecond timestamp so parallel
    /// test runs never collide on fixture files.
    fn tmp_path(name: &str) -> std::path::PathBuf {
        let nanos = std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH).unwrap().as_nanos();
        std::env::temp_dir().join(format!("lh_kb_ctx_{}_{}_{}", std::process::id(), nanos, name))
    }

    #[tokio::test]
    async fn empty_files_produce_empty_context() {
        // Paths don't exist — the loader must degrade to an empty context.
        let op = tmp_path("outcomes.jsonl");
        let cp = tmp_path("corrections.jsonl");
        let ctx = KbContext::load_from("sig123", "staffing.fill", &op, &cp).await;
        assert!(ctx.recent_outcomes.is_empty());
        assert!(ctx.recent_corrections.is_empty());
        assert!(ctx.success_rate.is_none());
        assert_eq!(ctx.total_observed, 0);
    }

    #[tokio::test]
    async fn exact_sig_hash_matches_take_priority() {
        let op = tmp_path("outcomes.jsonl");
        let cp = tmp_path("corrections.jsonl");
        write_fixture(&op, vec![
            // Other sig_hash, same task_class — loose match
            serde_json::json!({
                "sig_hash": "other", "task_class": "staffing.fill",
                "ok": false, "polarity": "failure_pattern", "turns": 1,
                "usage": {"latency_ms": 1000, "total_tokens": 100},
                "created_at": "2026-04-22T10:00:00Z",
            }),
            // Exact sig_hash — should lead
            serde_json::json!({
                "sig_hash": "sig123", "task_class": "staffing.fill",
                "ok": true, "polarity": "success_confirmation", "turns": 3,
                "usage": {"latency_ms": 2000, "total_tokens": 500},
                "created_at": "2026-04-23T10:00:00Z",
            }),
        ]).await;
        write_fixture(&cp, vec![]).await;

        let ctx = KbContext::load_from("sig123", "staffing.fill", &op, &cp).await;
        // Both rows surface; the exact sig_hash row must come first.
        assert_eq!(ctx.recent_outcomes.len(), 2);
        assert_eq!(ctx.recent_outcomes[0].created_at, "2026-04-23T10:00:00Z");
        assert_eq!(ctx.recent_outcomes[0].ok, true);
        // Aggregates cover both matched rows: 1 ok out of 2.
        assert_eq!(ctx.total_observed, 2);
        assert!((ctx.success_rate.unwrap() - 0.5).abs() < 0.001);
    }

    #[tokio::test]
    async fn corrupt_rows_are_skipped() {
        let op = tmp_path("outcomes.jsonl");
        let cp = tmp_path("corrections.jsonl");
        // Mix valid + invalid — invalid should be silently skipped.
        if let Some(dir) = op.parent() { tokio::fs::create_dir_all(dir).await.unwrap(); }
        tokio::fs::write(&op, "not json\n{\"sig_hash\":\"sig1\",\"task_class\":\"tc\",\"ok\":true,\"turns\":1,\"usage\":{}}\ngarbage\n").await.unwrap();
        write_fixture(&cp, vec![]).await;
        let ctx = KbContext::load_from("sig1", "tc", &op, &cp).await;
        assert_eq!(ctx.recent_outcomes.len(), 1);
    }

    #[tokio::test]
    async fn corrections_preview_is_truncated() {
        let op = tmp_path("outcomes.jsonl");
        let cp = tmp_path("corrections.jsonl");
        let long = "x".repeat(500);
        write_fixture(&op, vec![]).await;
        write_fixture(&cp, vec![serde_json::json!({
            "sig_hash": "sig1", "task_class": "tc",
            "reason": "abort", "correction": long, "applied_at_turn": 3,
            "created_at": "2026-04-23T10:00:00Z",
        })]).await;
        let ctx = KbContext::load_from("sig1", "tc", &op, &cp).await;
        assert_eq!(ctx.recent_corrections.len(), 1);
        // 300-char cap + 3-byte UTF-8 ellipsis character = 303-byte worst case.
        assert!(ctx.recent_corrections[0].correction_preview.len() <= 303);
    }

    #[test]
    fn prompt_section_is_stable_for_empty_context() {
        let ctx = KbContext::default();
        let s = ctx.to_prompt_section();
        assert!(s.contains("No prior similar runs recorded"));
    }

    #[test]
    fn prompt_section_reports_aggregate_rates() {
        let ctx = KbContext {
            total_observed: 10,
            success_rate: Some(0.7),
            avg_turns: Some(4.2),
            avg_latency_ms: Some(45000),
            ..Default::default()
        };
        let s = ctx.to_prompt_section();
        assert!(s.contains("success_rate=70.0%"));
        assert!(s.contains("avg_turns=4.2"));
        assert!(s.contains("avg_latency_ms=45000"));
    }
}
|
||||
File diff suppressed because it is too large
Load Diff
@ -1,19 +0,0 @@
|
||||
//! Library facade for the gateway crate so sub-binaries (e.g.
|
||||
//! `parity_extract_json`) can reuse the same modules the gateway
|
||||
//! binary uses.
|
||||
//!
|
||||
//! Added 2026-05-02 to support the cross-runtime parity probe at
|
||||
//! `golangLAKEHOUSE/scripts/cutover/parity/extract_json_parity.sh`.
|
||||
//! `extract_json` is the load-bearing public surface for that probe.
|
||||
//!
|
||||
//! main.rs still uses local `mod foo;` declarations independently —
|
||||
//! adding this file is purely additive (the binary's module tree is
|
||||
//! unchanged).
|
||||
|
||||
pub mod access;
|
||||
pub mod access_service;
|
||||
pub mod auth;
|
||||
pub mod execution_loop;
|
||||
pub mod observability;
|
||||
pub mod tools;
|
||||
pub mod v1;
|
||||
@ -1,7 +1,6 @@
|
||||
mod access;
|
||||
mod access_service;
|
||||
mod auth;
|
||||
mod execution_loop;
|
||||
mod observability;
|
||||
mod tools;
|
||||
mod v1;
|
||||
@ -68,62 +67,14 @@ async fn main() {
|
||||
let access = access::AccessControl::new(config.auth.enabled);
|
||||
access.register_defaults().await;
|
||||
|
||||
// Phase 42 — file-backed truth rules. Probes the `truth/` directory
|
||||
// at repo root (or $LAKEHOUSE_TRUTH_DIR override) and logs how many
|
||||
// rules load. Current request paths still build their own stores
|
||||
// via truth::default_truth_store() / truth::sql_query_guard_store();
|
||||
// the composed-at-boot store gets plumbed through V1State in a
|
||||
// follow-up. This boot probe catches parse errors + duplicate-ID
|
||||
// collisions early rather than at first request.
|
||||
{
|
||||
let truth_dir = std::env::var("LAKEHOUSE_TRUTH_DIR")
|
||||
.unwrap_or_else(|_| "/home/profit/lakehouse/truth".to_string());
|
||||
if std::path::Path::new(&truth_dir).exists() {
|
||||
let mut probe_store = truth::default_truth_store();
|
||||
match truth::loader::load_from_dir(&mut probe_store, &truth_dir) {
|
||||
Ok(n) => tracing::info!("truth: loaded {n} file-backed rule(s) from {truth_dir}"),
|
||||
Err(e) => tracing::warn!("truth: failed to load rules from {truth_dir}: {e}"),
|
||||
}
|
||||
} else {
|
||||
tracing::debug!("truth: no rule dir at {truth_dir}, skipping file-backed load");
|
||||
}
|
||||
}
|
||||
|
||||
// Workspace manager for agent-specific overlays
|
||||
let workspace_mgr = queryd::workspace::WorkspaceManager::new(store.clone());
|
||||
if let Err(e) = workspace_mgr.rebuild().await {
|
||||
tracing::warn!("workspace rebuild: {e}");
|
||||
}
|
||||
|
||||
// AI sidecar clients — Phase 44 part 3 (2026-04-27).
|
||||
//
|
||||
// Two flavors of the same client:
|
||||
// - `ai_client_direct` posts directly to ${sidecar}/generate. Used
|
||||
// inside the gateway by V1State + the legacy /ai proxy. These
|
||||
// call sites are themselves the implementation of /v1/chat
|
||||
// (or its sidecar shim), so routing them through /v1/chat
|
||||
// would self-loop.
|
||||
// - `ai_client_observable` posts via ${gateway}/v1/chat with
|
||||
// provider="ollama". Used by vectord modules (autotune agent,
|
||||
// /vectors service) so their LLM calls land in /v1/usage and
|
||||
// Langfuse traces. Adds one localhost HTTP hop per call (~ms);
|
||||
// accepted for the observability gain.
|
||||
//
|
||||
// The gateway can call its own /v1/chat over localhost during
|
||||
// boot's transient period because we don't fire any LLM calls
|
||||
// until the listener is up — the observable client is just
|
||||
// configured here, not exercised.
|
||||
let ai_client_direct = aibridge::client::AiClient::new(&config.sidecar.url);
|
||||
let gateway_self_url = format!("http://{}:{}", config.gateway.host, config.gateway.port);
|
||||
let ai_client_observable = aibridge::client::AiClient::new_with_gateway(
|
||||
&config.sidecar.url,
|
||||
&gateway_self_url,
|
||||
);
|
||||
// Backwards-compat alias for the (many) existing references in this file.
|
||||
// Defaults to direct so the existing wiring (V1State, /ai proxy)
|
||||
// keeps its non-self-loop transport. New vectord wiring below
|
||||
// explicitly uses ai_client_observable.
|
||||
let ai_client = ai_client_direct.clone();
|
||||
// AI sidecar client
|
||||
let ai_client = aibridge::client::AiClient::new(&config.sidecar.url);
|
||||
|
||||
// Vector service components — built before the router because both the
|
||||
// /vectors service AND ingestd need the agent handle to enqueue triggers.
|
||||
@ -141,12 +92,6 @@ async fn main() {
|
||||
// operators call POST /vectors/playbook_memory/rebuild to populate.
|
||||
let pbm = vectord::playbook_memory::PlaybookMemory::new(store.clone());
|
||||
let _ = pbm.load_from_storage().await;
|
||||
// Pathway memory — consensus-designed sidecar for full-context
|
||||
// backtracking + hot-swap of successful review pathways. Same
|
||||
// load-on-boot pattern as playbook_memory: empty state is fine,
|
||||
// operators start populating via scrum_master_pipeline.ts.
|
||||
let pwm = vectord::pathway_memory::PathwayMemory::new(store.clone());
|
||||
let _ = pwm.load_from_storage().await;
|
||||
|
||||
// Phase 16.2: spawn the autotune agent. When config.agent.enabled=false
|
||||
// this returns a handle that drops triggers silently — no surprise load.
|
||||
@ -161,9 +106,7 @@ async fn main() {
|
||||
agent_cfg,
|
||||
vectord::agent::AgentDeps {
|
||||
store: store.clone(),
|
||||
// Observable: autotune agent's LLM calls go through
|
||||
// /v1/chat for /v1/usage + Langfuse visibility.
|
||||
ai_client: ai_client_observable.clone(),
|
||||
ai_client: ai_client.clone(),
|
||||
catalog: registry.clone(),
|
||||
index_registry: index_reg.clone(),
|
||||
hnsw_store: hnsw.clone(),
|
||||
@ -210,17 +153,10 @@ async fn main() {
|
||||
agent_handle: agent_handle.clone(),
|
||||
index_registry: index_reg.clone(),
|
||||
schedules: sched_store,
|
||||
// P9-001 fix 2026-04-23: journal reference flows into ingest so
|
||||
// successful uploads emit a record_ingest event. Journal is Clone
|
||||
// (Arc<RwLock> inside) so the /journal nest below still sees the
|
||||
// same buffer + persistence.
|
||||
journal: Some(journal.clone()),
|
||||
}))
|
||||
.nest("/vectors", vectord::service::router(vectord::service::VectorState {
|
||||
store: store.clone(),
|
||||
// Observable: /vectors service's LLM calls (RAG, summary,
|
||||
// playbook synthesis, etc.) flow through /v1/chat.
|
||||
ai_client: ai_client_observable.clone(),
|
||||
ai_client: ai_client.clone(),
|
||||
job_tracker: vectord::jobs::JobTracker::new(),
|
||||
index_registry: index_reg.clone(),
|
||||
hnsw_store: hnsw,
|
||||
@ -236,19 +172,17 @@ async fn main() {
|
||||
bucket_registry.clone(), index_reg.clone(),
|
||||
),
|
||||
playbook_memory: pbm,
|
||||
pathway_memory: pwm,
|
||||
embed_semaphore: std::sync::Arc::new(tokio::sync::Semaphore::new(1)),
|
||||
}))
|
||||
.nest("/workspaces", queryd::workspace_service::router(workspace_mgr))
|
||||
.nest("/journal", journald::service::router(journal))
|
||||
.nest("/access", access_service::router(access))
|
||||
.nest("/tools", tools::service::router({
|
||||
let tool_reg = tools::registry::ToolRegistry::new();
|
||||
let tool_reg = tools::registry::ToolRegistry::new_with_defaults();
|
||||
tool_reg.register_defaults().await;
|
||||
tools::ToolState {
|
||||
registry: tool_reg,
|
||||
query_fn: tools::QueryExecutor::new(engine.clone()),
|
||||
truth: std::sync::Arc::new(truth::sql_query_guard_store()),
|
||||
}
|
||||
}))
|
||||
// Phase 38 — Universal API skeleton. Thin OpenAI-compatible
|
||||
@ -270,86 +204,6 @@ async fn main() {
|
||||
}
|
||||
k
|
||||
},
|
||||
openrouter_key: {
|
||||
// 2026-04-24 free-tier rescue rung for iter 5+. Shares
|
||||
// the LLM Team UI's OPENROUTER_API_KEY so both systems
|
||||
// draw from one quota.
|
||||
let k = v1::openrouter::resolve_openrouter_key();
|
||||
if k.is_some() {
|
||||
tracing::info!("v1: OpenRouter key loaded — /v1/chat provider=openrouter enabled");
|
||||
} else {
|
||||
tracing::warn!("v1: no OpenRouter key — openrouter rescue rung will 503");
|
||||
}
|
||||
k
|
||||
},
|
||||
gemini_key: {
|
||||
// Phase 40 provider. GEMINI_API_KEY in env or .env.
|
||||
let k = v1::gemini::resolve_gemini_key();
|
||||
if k.is_some() {
|
||||
tracing::info!("v1: Gemini key loaded — /v1/chat provider=gemini enabled");
|
||||
} else {
|
||||
tracing::debug!("v1: no Gemini key — provider=gemini will 503");
|
||||
}
|
||||
k
|
||||
},
|
||||
claude_key: {
|
||||
// Phase 40 provider. ANTHROPIC_API_KEY in env or .env.
|
||||
let k = v1::claude::resolve_claude_key();
|
||||
if k.is_some() {
|
||||
tracing::info!("v1: Claude key loaded — /v1/chat provider=claude enabled");
|
||||
} else {
|
||||
tracing::debug!("v1: no Claude key — provider=claude will 503");
|
||||
}
|
||||
k
|
||||
},
|
||||
kimi_key: {
|
||||
// Direct Kimi For Coding (api.kimi.com) — bypasses the
|
||||
// broken-upstream kimi-k2:1t and OpenRouter rate caps.
|
||||
// Key from /etc/lakehouse/kimi.env (KIMI_API_KEY=sk-kimi-…).
|
||||
let k = v1::kimi::resolve_kimi_key();
|
||||
if k.is_some() {
|
||||
tracing::info!("v1: Kimi key loaded — /v1/chat provider=kimi enabled (model=kimi-for-coding)");
|
||||
} else {
|
||||
tracing::debug!("v1: no Kimi key — provider=kimi will 503");
|
||||
}
|
||||
k
|
||||
},
|
||||
opencode_key: {
|
||||
// OpenCode GO multi-vendor gateway — Claude Opus 4.7,
|
||||
// GPT-5.5-pro, Gemini 3.1-pro, Kimi K2.6, DeepSeek, GLM,
|
||||
// Qwen + free-tier. Key from /etc/lakehouse/opencode.env.
|
||||
let k = v1::opencode::resolve_opencode_key();
|
||||
if k.is_some() {
|
||||
tracing::info!("v1: OpenCode key loaded — /v1/chat provider=opencode enabled (40 models)");
|
||||
} else {
|
||||
tracing::debug!("v1: no OpenCode key — provider=opencode will 503");
|
||||
}
|
||||
k
|
||||
},
|
||||
validate_workers: {
|
||||
// Load workers_500k.parquet snapshot for /v1/validate.
|
||||
// Path overridable via LH_WORKERS_PARQUET env. Missing
|
||||
// file is non-fatal — validators run schema/PII checks
|
||||
// unaffected; only worker-existence checks fail clean.
|
||||
let path_str = std::env::var("LH_WORKERS_PARQUET")
|
||||
.unwrap_or_else(|_| "/home/profit/lakehouse/data/datasets/workers_500k.parquet".into());
|
||||
let path = std::path::Path::new(&path_str);
|
||||
if path.exists() {
|
||||
match validator::staffing::parquet_lookup::load_workers_parquet(path) {
|
||||
Ok(lookup) => {
|
||||
tracing::info!("v1: workers parquet loaded from {} — /v1/validate worker-existence checks enabled", path_str);
|
||||
lookup
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::warn!("v1: workers parquet at {} unreadable ({e}) — /v1/validate worker-existence checks will fail Consistency", path_str);
|
||||
std::sync::Arc::new(validator::InMemoryWorkerLookup::new())
|
||||
}
|
||||
}
|
||||
} else {
|
||||
tracing::warn!("v1: workers parquet at {} not found — /v1/validate worker-existence checks will fail Consistency", path_str);
|
||||
std::sync::Arc::new(validator::InMemoryWorkerLookup::new())
|
||||
}
|
||||
},
|
||||
// Phase 40 early deliverable — Langfuse trace emitter.
|
||||
// Defaults match mcp-server/tracing.ts conventions so
|
||||
// gateway traces land in the same staffing project.
|
||||
@ -362,37 +216,16 @@ async fn main() {
|
||||
}
|
||||
c
|
||||
},
|
||||
// Coordinator session JSONL — one row per /v1/iterate
|
||||
// session for offline DuckDB analysis. Cross-runtime
|
||||
// parity with Go-side validatord (commit 1a3a82a).
|
||||
session_log: {
|
||||
let path = &config.gateway.session_log_path;
|
||||
let s = v1::session_log::SessionLogger::from_path(path);
|
||||
if s.is_some() {
|
||||
tracing::info!(
|
||||
"v1: session log enabled — coordinator sessions written to {}",
|
||||
path
|
||||
);
|
||||
} else {
|
||||
tracing::info!("v1: session log disabled (set [gateway].session_log_path to enable)");
|
||||
}
|
||||
s
|
||||
},
|
||||
}));
|
||||
|
||||
// Auth middleware (if enabled) — P5-001 fix 2026-04-23:
|
||||
// previously only inserted the ApiKey as an extension and never layered
|
||||
// the middleware, so auth.enabled=true enforced nothing. Now wraps the
|
||||
// router with from_fn_with_state, which calls api_key_auth on every
|
||||
// request. /health is exempted inside the middleware (LB probes).
|
||||
// Auth middleware (if enabled)
|
||||
if config.auth.enabled {
|
||||
if let Some(ref key) = config.auth.api_key {
|
||||
tracing::info!("API key auth enabled — enforcing on all routes except /health");
|
||||
tracing::info!("API key auth enabled");
|
||||
let api_key = auth::ApiKey(key.clone());
|
||||
app = app.layer(axum::middleware::from_fn_with_state(
|
||||
api_key,
|
||||
auth::api_key_auth,
|
||||
));
|
||||
app = app.layer(axum::Extension(api_key));
|
||||
// Note: auth middleware applied per-route in production
|
||||
// For now, the ApiKey extension is available for handlers to check
|
||||
} else {
|
||||
tracing::warn!("auth enabled but no api_key set — all requests allowed");
|
||||
}
|
||||
|
||||
@ -3,18 +3,12 @@ pub mod service;
|
||||
|
||||
use queryd::context::QueryEngine;
|
||||
use arrow::json::writer::{JsonArray, Writer as JsonWriter};
|
||||
use std::sync::Arc;
|
||||
use truth::TruthStore;
|
||||
|
||||
/// State for the tool system.
|
||||
#[derive(Clone)]
|
||||
pub struct ToolState {
|
||||
pub registry: registry::ToolRegistry,
|
||||
pub query_fn: QueryExecutor,
|
||||
/// SQL guard (shared with queryd). Mirrors the queryd /sql truth
|
||||
/// gate from P42-002 (9cc0ceb) — tools also execute model-
|
||||
/// originated SQL, need the same destructive-verb block.
|
||||
pub truth: Arc<TruthStore>,
|
||||
}
|
||||
|
||||
/// Wraps QueryEngine to provide a simple execute interface for tools.
|
||||
|
||||
@ -67,14 +67,24 @@ pub struct ToolRegistry {
|
||||
}
|
||||
|
||||
impl ToolRegistry {
|
||||
/// Build an empty registry. Callers in an async context should follow
|
||||
/// this with `.register_defaults().await` if they want the built-in
|
||||
/// staffing tools pre-installed — main.rs does exactly that.
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
let registry = Self {
|
||||
tools: Arc::new(RwLock::new(HashMap::new())),
|
||||
audit_log: Arc::new(RwLock::new(Vec::new())),
|
||||
}
|
||||
};
|
||||
// Register built-in staffing tools
|
||||
tokio::task::block_in_place(|| {
|
||||
tokio::runtime::Handle::current().block_on(registry.register_defaults())
|
||||
});
|
||||
registry
|
||||
}
|
||||
|
||||
pub fn new_with_defaults() -> Self {
|
||||
let registry = Self {
|
||||
tools: Arc::new(RwLock::new(HashMap::new())),
|
||||
audit_log: Arc::new(RwLock::new(Vec::new())),
|
||||
};
|
||||
registry
|
||||
}
|
||||
|
||||
/// Register default staffing tools.
|
||||
|
||||
@ -7,7 +7,7 @@ use axum::{
|
||||
};
|
||||
use serde::Deserialize;
|
||||
|
||||
use super::registry::{ToolInvocation, ToolRegistry};
|
||||
use super::registry::{Permission, ToolInvocation, ToolRegistry};
|
||||
use crate::tools::ToolState;
|
||||
|
||||
pub fn router(state: ToolState) -> Router {
|
||||
@ -92,32 +92,6 @@ async fn call_tool(
|
||||
}
|
||||
};
|
||||
|
||||
// Truth gate — same contract as queryd /sql (P42-002). Rejects
|
||||
// destructive verbs + empty SQL. Scrum iter 11 CF-1 + CF-2 on this
|
||||
// file: tools executed model-provided SQL parameters without any
|
||||
// validation. Close the gap here so the parallel surface has the
|
||||
// same safety floor as queryd.
|
||||
let ctx = serde_json::json!({ "sql": sql });
|
||||
for outcome in state.truth.evaluate("sql_query", &ctx) {
|
||||
if outcome.passed {
|
||||
if let truth::RuleAction::Reject { message } | truth::RuleAction::Block { message } = &outcome.action {
|
||||
tracing::warn!("tool {name}: SQL blocked by truth gate ({}): {message}", outcome.rule_id);
|
||||
state.registry.log_invocation(ToolInvocation {
|
||||
id: format!("inv-{}", chrono::Utc::now().timestamp_millis()),
|
||||
tool_name: name.clone(),
|
||||
agent: req.agent.clone(),
|
||||
params: req.params.clone(),
|
||||
permission: tool.permission.clone(),
|
||||
timestamp: chrono::Utc::now(),
|
||||
success: false,
|
||||
error: Some(format!("truth gate: {message}")),
|
||||
rows_returned: None,
|
||||
}).await;
|
||||
return Err((StatusCode::FORBIDDEN, message.clone()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Execute via query engine
|
||||
let result = state.query_fn.execute(&sql).await;
|
||||
|
||||
|
||||
@ -1,222 +0,0 @@
|
||||
//! Claude (Anthropic) adapter.
|
||||
//!
|
||||
//! POST `https://api.anthropic.com/v1/messages`. Auth via `x-api-key`
|
||||
//! header (not bearer) + required `anthropic-version` header. Payload
|
||||
//! is NOT OpenAI-compatible — response text lives at
|
||||
//! `content[0].text`. Phase 40 deliverable. System prompts travel in
|
||||
//! a top-level `system` field, separate from the `messages` array.
|
||||
|
||||
use std::time::Duration;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use super::{ChatRequest, ChatResponse, Choice, Message, UsageBlock};
|
||||
|
||||
const CLAUDE_BASE_URL: &str = "https://api.anthropic.com/v1";
|
||||
const CLAUDE_API_VERSION: &str = "2023-06-01";
|
||||
const CLAUDE_TIMEOUT_SECS: u64 = 180;
|
||||
|
||||
pub fn resolve_claude_key() -> Option<String> {
|
||||
if let Ok(k) = std::env::var("ANTHROPIC_API_KEY") {
|
||||
if !k.trim().is_empty() { return Some(k.trim().to_string()); }
|
||||
}
|
||||
for path in ["/home/profit/.env", "/root/.env"] {
|
||||
if let Ok(raw) = std::fs::read_to_string(path) {
|
||||
for line in raw.lines() {
|
||||
if let Some(rest) = line.strip_prefix("ANTHROPIC_API_KEY=") {
|
||||
let k = rest.trim().trim_matches('"').trim_matches('\'');
|
||||
if !k.is_empty() { return Some(k.to_string()); }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
pub async fn chat(
|
||||
key: &str,
|
||||
req: &ChatRequest,
|
||||
) -> Result<ChatResponse, String> {
|
||||
// Strip the "claude/" prefix if the caller used the namespaced form.
|
||||
let model = req.model.strip_prefix("claude/").unwrap_or(&req.model).to_string();
|
||||
|
||||
// Anthropic carries system prompts outside the messages array.
|
||||
// Concatenate any system-role messages into a single system string;
|
||||
// keep user + assistant messages in `messages`.
|
||||
let mut system_parts: Vec<String> = Vec::new();
|
||||
let mut msgs: Vec<AnMessage> = Vec::new();
|
||||
for m in &req.messages {
|
||||
if m.role == "system" {
|
||||
system_parts.push(m.text());
|
||||
} else {
|
||||
// Anthropic expects strictly "user" or "assistant"; anything
|
||||
// else we normalize to "user".
|
||||
let role = if m.role == "assistant" { "assistant" } else { "user" };
|
||||
msgs.push(AnMessage { role: role.to_string(), content: m.text() });
|
||||
}
|
||||
}
|
||||
let system = if system_parts.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(system_parts.join("\n\n"))
|
||||
};
|
||||
|
||||
let body = AnChatBody {
|
||||
model: model.clone(),
|
||||
messages: msgs,
|
||||
max_tokens: req.max_tokens.unwrap_or(800),
|
||||
temperature: req.temperature.unwrap_or(0.3),
|
||||
system,
|
||||
};
|
||||
|
||||
let client = reqwest::Client::builder()
|
||||
.timeout(Duration::from_secs(CLAUDE_TIMEOUT_SECS))
|
||||
.build()
|
||||
.map_err(|e| format!("build client: {e}"))?;
|
||||
|
||||
let t0 = std::time::Instant::now();
|
||||
let resp = client
|
||||
.post(format!("{}/messages", CLAUDE_BASE_URL))
|
||||
.header("x-api-key", key)
|
||||
.header("anthropic-version", CLAUDE_API_VERSION)
|
||||
.json(&body)
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| format!("api.anthropic.com unreachable: {e}"))?;
|
||||
|
||||
let status = resp.status();
|
||||
if !status.is_success() {
|
||||
let body = resp.text().await.unwrap_or_else(|_| "?".into());
|
||||
return Err(format!("claude {}: {}", status, body));
|
||||
}
|
||||
|
||||
let parsed: AnChatResponse = resp.json().await
|
||||
.map_err(|e| format!("invalid claude response: {e}"))?;
|
||||
|
||||
let latency_ms = t0.elapsed().as_millis();
|
||||
let text = parsed.content.into_iter()
|
||||
.find(|b| b.block_type == "text")
|
||||
.map(|b| b.text)
|
||||
.unwrap_or_default();
|
||||
|
||||
let prompt_tokens = parsed.usage.as_ref().map(|u| u.input_tokens).unwrap_or_else(|| {
|
||||
let chars: usize = req.messages.iter().map(|m| m.text().chars().count()).sum();
|
||||
((chars + 3) / 4) as u32
|
||||
});
|
||||
let completion_tokens = parsed.usage.as_ref().map(|u| u.output_tokens).unwrap_or_else(|| {
|
||||
((text.chars().count() + 3) / 4) as u32
|
||||
});
|
||||
|
||||
tracing::info!(
|
||||
target: "v1.chat",
|
||||
provider = "claude",
|
||||
model = %model,
|
||||
prompt_tokens,
|
||||
completion_tokens,
|
||||
latency_ms = latency_ms as u64,
|
||||
"claude chat completed",
|
||||
);
|
||||
|
||||
Ok(ChatResponse {
|
||||
id: format!("chatcmpl-{}", chrono::Utc::now().timestamp_nanos_opt().unwrap_or(0)),
|
||||
object: "chat.completion",
|
||||
created: chrono::Utc::now().timestamp(),
|
||||
model,
|
||||
choices: vec![Choice {
|
||||
index: 0,
|
||||
message: Message::new_text("assistant", text),
|
||||
finish_reason: parsed.stop_reason.unwrap_or_else(|| "stop".into()),
|
||||
}],
|
||||
usage: UsageBlock {
|
||||
prompt_tokens,
|
||||
completion_tokens,
|
||||
total_tokens: prompt_tokens + completion_tokens,
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
// -- Anthropic Messages API wire shapes --
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct AnChatBody {
|
||||
model: String,
|
||||
messages: Vec<AnMessage>,
|
||||
max_tokens: u32,
|
||||
temperature: f64,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
system: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct AnMessage { role: String, content: String }
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct AnChatResponse {
|
||||
content: Vec<AnContentBlock>,
|
||||
#[serde(default, rename = "stop_reason")]
|
||||
stop_reason: Option<String>,
|
||||
#[serde(default)]
|
||||
usage: Option<AnUsage>,
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct AnContentBlock {
|
||||
#[serde(rename = "type")]
|
||||
block_type: String,
|
||||
#[serde(default)]
|
||||
text: String,
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct AnUsage { input_tokens: u32, output_tokens: u32 }
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn resolve_claude_key_does_not_panic() {
|
||||
let _ = resolve_claude_key();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn chat_body_serializes_with_separate_system() {
|
||||
let body = AnChatBody {
|
||||
model: "claude-3-5-sonnet-latest".into(),
|
||||
messages: vec![
|
||||
AnMessage { role: "user".into(), content: "hi".into() },
|
||||
],
|
||||
max_tokens: 800,
|
||||
temperature: 0.3,
|
||||
system: Some("You are helpful.".into()),
|
||||
};
|
||||
let json = serde_json::to_string(&body).unwrap();
|
||||
assert!(json.contains("\"system\":\"You are helpful.\""));
|
||||
assert!(json.contains("\"messages\""));
|
||||
assert!(json.contains("\"max_tokens\":800"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn body_omits_system_when_none() {
|
||||
let body = AnChatBody {
|
||||
model: "claude-3-5-sonnet-latest".into(),
|
||||
messages: vec![AnMessage { role: "user".into(), content: "hi".into() }],
|
||||
max_tokens: 800,
|
||||
temperature: 0.3,
|
||||
system: None,
|
||||
};
|
||||
let json = serde_json::to_string(&body).unwrap();
|
||||
assert!(!json.contains("\"system\""), "system field should be skipped when None: {json}");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn model_prefix_strip_preserves_bare_names() {
|
||||
let cases = [
|
||||
("claude/claude-3-5-sonnet-latest", "claude-3-5-sonnet-latest"),
|
||||
("claude-3-5-sonnet-latest", "claude-3-5-sonnet-latest"),
|
||||
];
|
||||
for (input, expected) in cases {
|
||||
let out = input.strip_prefix("claude/").unwrap_or(input);
|
||||
assert_eq!(out, expected);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1,230 +0,0 @@
|
||||
//! Gemini adapter — Google's Generative Language API.
|
||||
//!
|
||||
//! POST `https://generativelanguage.googleapis.com/v1beta/models/
|
||||
//! {model}:generateContent?key=<API_KEY>`. Auth via query-string key
|
||||
//! (not bearer). Payload shape is NOT OpenAI-compatible — we map
|
||||
//! messages → contents + parts, extract response from `candidates[0]
|
||||
//! .content.parts[0].text`. Phase 40 deliverable; gate: `/v1/chat`
|
||||
//! with a prefixed or explicit gemini model returns normally.
|
||||
|
||||
use std::time::Duration;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use super::{ChatRequest, ChatResponse, Choice, Message, UsageBlock};
|
||||
|
||||
const GEMINI_BASE_URL: &str = "https://generativelanguage.googleapis.com/v1beta";
|
||||
const GEMINI_TIMEOUT_SECS: u64 = 180;
|
||||
|
||||
pub fn resolve_gemini_key() -> Option<String> {
|
||||
if let Ok(k) = std::env::var("GEMINI_API_KEY") {
|
||||
if !k.trim().is_empty() { return Some(k.trim().to_string()); }
|
||||
}
|
||||
for path in ["/home/profit/.env", "/root/.env"] {
|
||||
if let Ok(raw) = std::fs::read_to_string(path) {
|
||||
for line in raw.lines() {
|
||||
if let Some(rest) = line.strip_prefix("GEMINI_API_KEY=") {
|
||||
let k = rest.trim().trim_matches('"').trim_matches('\'');
|
||||
if !k.is_empty() { return Some(k.to_string()); }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
pub async fn chat(
|
||||
key: &str,
|
||||
req: &ChatRequest,
|
||||
) -> Result<ChatResponse, String> {
|
||||
// Strip the "gemini/" prefix if the caller used the namespaced form.
|
||||
let model = req.model.strip_prefix("gemini/").unwrap_or(&req.model).to_string();
|
||||
|
||||
// Gemini splits system prompt from conversation differently.
|
||||
// Simplest working mapping: concatenate any system messages at the
|
||||
// top of a single user turn, then append user/assistant turns as
|
||||
// separate contents entries. Covers the common single-turn case
|
||||
// the scrum pipeline uses.
|
||||
let mut contents: Vec<GmContent> = Vec::new();
|
||||
for m in &req.messages {
|
||||
let role = match m.role.as_str() {
|
||||
"system" | "user" => "user",
|
||||
_ => "model",
|
||||
};
|
||||
contents.push(GmContent {
|
||||
role: role.to_string(),
|
||||
parts: vec![GmPart { text: m.text() }],
|
||||
});
|
||||
}
|
||||
|
||||
let body = GmChatBody {
|
||||
contents,
|
||||
generation_config: GmGenerationConfig {
|
||||
temperature: req.temperature.unwrap_or(0.3),
|
||||
max_output_tokens: req.max_tokens.unwrap_or(800),
|
||||
},
|
||||
};
|
||||
|
||||
let client = reqwest::Client::builder()
|
||||
.timeout(Duration::from_secs(GEMINI_TIMEOUT_SECS))
|
||||
.build()
|
||||
.map_err(|e| format!("build client: {e}"))?;
|
||||
|
||||
let url = format!("{}/models/{}:generateContent?key={}", GEMINI_BASE_URL, model, key);
|
||||
let t0 = std::time::Instant::now();
|
||||
let resp = client
|
||||
.post(&url)
|
||||
.json(&body)
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| format!("generativelanguage.googleapis.com unreachable: {e}"))?;
|
||||
|
||||
let status = resp.status();
|
||||
if !status.is_success() {
|
||||
let body = resp.text().await.unwrap_or_else(|_| "?".into());
|
||||
return Err(format!("gemini {}: {}", status, body));
|
||||
}
|
||||
|
||||
let parsed: GmChatResponse = resp.json().await
|
||||
.map_err(|e| format!("invalid gemini response: {e}"))?;
|
||||
|
||||
let latency_ms = t0.elapsed().as_millis();
|
||||
let candidate = parsed.candidates.into_iter().next()
|
||||
.ok_or_else(|| "gemini returned no candidates".to_string())?;
|
||||
let text = candidate.content.parts.into_iter()
|
||||
.next()
|
||||
.map(|p| p.text)
|
||||
.unwrap_or_default();
|
||||
|
||||
let prompt_tokens = parsed.usage_metadata.as_ref()
|
||||
.map(|u| u.prompt_token_count)
|
||||
.unwrap_or_else(|| {
|
||||
let chars: usize = req.messages.iter().map(|m| m.text().chars().count()).sum();
|
||||
((chars + 3) / 4) as u32
|
||||
});
|
||||
let completion_tokens = parsed.usage_metadata.as_ref()
|
||||
.map(|u| u.candidates_token_count)
|
||||
.unwrap_or_else(|| ((text.chars().count() + 3) / 4) as u32);
|
||||
|
||||
tracing::info!(
|
||||
target: "v1.chat",
|
||||
provider = "gemini",
|
||||
model = %model,
|
||||
prompt_tokens,
|
||||
completion_tokens,
|
||||
latency_ms = latency_ms as u64,
|
||||
"gemini chat completed",
|
||||
);
|
||||
|
||||
Ok(ChatResponse {
|
||||
id: format!("chatcmpl-{}", chrono::Utc::now().timestamp_nanos_opt().unwrap_or(0)),
|
||||
object: "chat.completion",
|
||||
created: chrono::Utc::now().timestamp(),
|
||||
model,
|
||||
choices: vec![Choice {
|
||||
index: 0,
|
||||
message: Message::new_text("assistant", text),
|
||||
finish_reason: candidate.finish_reason.unwrap_or_else(|| "stop".into()),
|
||||
}],
|
||||
usage: UsageBlock {
|
||||
prompt_tokens,
|
||||
completion_tokens,
|
||||
total_tokens: prompt_tokens + completion_tokens,
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
// -- Gemini wire shapes --
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct GmChatBody {
|
||||
contents: Vec<GmContent>,
|
||||
#[serde(rename = "generationConfig")]
|
||||
generation_config: GmGenerationConfig,
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct GmContent {
|
||||
role: String,
|
||||
parts: Vec<GmPart>,
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct GmPart { text: String }
|
||||
|
||||
#[derive(Serialize)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
struct GmGenerationConfig {
|
||||
temperature: f64,
|
||||
max_output_tokens: u32,
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct GmChatResponse {
|
||||
candidates: Vec<GmCandidate>,
|
||||
#[serde(default, rename = "usageMetadata")]
|
||||
usage_metadata: Option<GmUsage>,
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct GmCandidate {
|
||||
content: GmContentResp,
|
||||
#[serde(default, rename = "finishReason")]
|
||||
finish_reason: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct GmContentResp { parts: Vec<GmPartResp> }
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct GmPartResp { #[serde(default)] text: String }
|
||||
|
||||
#[derive(Deserialize)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
struct GmUsage {
|
||||
prompt_token_count: u32,
|
||||
candidates_token_count: u32,
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn resolve_gemini_key_does_not_panic() {
|
||||
let _ = resolve_gemini_key();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn chat_body_serializes_to_gemini_shape() {
|
||||
let body = GmChatBody {
|
||||
contents: vec![
|
||||
GmContent {
|
||||
role: "user".into(),
|
||||
parts: vec![GmPart { text: "hello".into() }],
|
||||
},
|
||||
],
|
||||
generation_config: GmGenerationConfig {
|
||||
temperature: 0.3,
|
||||
max_output_tokens: 800,
|
||||
},
|
||||
};
|
||||
let json = serde_json::to_string(&body).unwrap();
|
||||
assert!(json.contains("\"contents\""));
|
||||
assert!(json.contains("\"parts\""));
|
||||
// camelCase per Gemini API
|
||||
assert!(json.contains("\"generationConfig\""));
|
||||
assert!(json.contains("\"maxOutputTokens\":800"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn model_prefix_strip_preserves_bare_names() {
|
||||
let cases = [
|
||||
("gemini/gemini-2.0-flash", "gemini-2.0-flash"),
|
||||
("gemini-2.0-flash", "gemini-2.0-flash"),
|
||||
];
|
||||
for (input, expected) in cases {
|
||||
let out = input.strip_prefix("gemini/").unwrap_or(input);
|
||||
assert_eq!(out, expected);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1,543 +0,0 @@
|
||||
//! /v1/iterate — the Phase 43 PRD's "generate → validate → correct → retry" loop.
|
||||
//!
|
||||
//! Closes the "0→85% with iteration" thesis structurally. A caller
|
||||
//! posts a prompt + artifact kind + validation context; the gateway:
|
||||
//! 1. Generates a JSON artifact via /v1/chat (any provider/model)
|
||||
//! 2. Extracts the JSON object from the model output
|
||||
//! 3. Validates via /v1/validate (FillValidator / EmailValidator /
|
||||
//! PlaybookValidator with the shared WorkerLookup)
|
||||
//! 4. On ValidationError, appends the error to the prompt and
|
||||
//! retries up to `max_iterations` (default 3)
|
||||
//! 5. Returns the accepted artifact + Report on success, OR the
|
||||
//! attempt history + final error on max-iter exhaustion
|
||||
//!
|
||||
//! Internal calls go via HTTP loopback to localhost:gateway_port so
|
||||
//! the same /v1/usage tracking and Langfuse traces apply. A small
|
||||
//! latency cost (~1-3ms per loopback hop) for clean separation of
|
||||
//! concerns and observability.
|
||||
//!
|
||||
//! 2026-04-27 Phase 43 v3 part 3: this endpoint makes the iteration
|
||||
//! loop a first-class lakehouse capability rather than a per-caller
|
||||
//! re-implementation. Staffing executors, agent loops, and future
|
||||
//! validators all reach the same code path.
|
||||
|
||||
use axum::{extract::State, http::{HeaderMap, StatusCode}, response::IntoResponse, Json};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
// Retry budget per the Phase 43 PRD; callers may override per request.
const DEFAULT_MAX_ITERATIONS: u32 = 3;
// Upper bound on each loopback hop; chat calls against reasoning
// models can legitimately take minutes.
const LOOPBACK_TIMEOUT_SECS: u64 = 240;

/// Header name used to propagate a Langfuse parent trace id across
/// daemon boundaries. Matches Go's `shared.TraceIDHeader` constant
/// byte-for-byte (commit d6d2fdf in golangLAKEHOUSE) — same wire
/// format means a Go caller can hit Rust's /v1/iterate (or vice
/// versa) and the resulting Langfuse trees nest correctly.
pub const TRACE_ID_HEADER: &str = "x-lakehouse-trace-id";
|
||||
|
||||
/// Request body for POST /v1/iterate — one generate→validate→retry
/// session. All fields except `kind`, `prompt`, `provider`, `model`
/// are optional with documented defaults.
#[derive(Deserialize)]
pub struct IterateRequest {
    /// "fill" | "email" | "playbook" — picks which validator runs.
    pub kind: String,
    /// The prompt to seed generation. Validation errors from prior
    /// attempts are appended on retry.
    pub prompt: String,
    /// Provider/model passed through to /v1/chat. e.g. "ollama_cloud"
    /// + "kimi-k2.6", or "opencode" + "claude-haiku-4-5".
    pub provider: String,
    pub model: String,
    /// Optional system prompt — sent to /v1/chat as the system message.
    #[serde(default)]
    pub system: Option<String>,
    /// Validation context (target_count, city, state, role, client_id
    /// for fills; candidate_id for emails). Forwarded to /v1/validate.
    #[serde(default)]
    pub context: Option<serde_json::Value>,
    /// Cap on iteration count. Defaults to 3 per the Phase 43 PRD.
    #[serde(default)]
    pub max_iterations: Option<u32>,
    /// Forwarded to /v1/chat. Defaults to 0.2 if unset.
    #[serde(default)]
    pub temperature: Option<f64>,
    /// Forwarded to /v1/chat. Defaults to 4096 if unset.
    #[serde(default)]
    pub max_tokens: Option<u32>,
}
|
||||
|
||||
/// One entry in the response's attempt history: which iteration it
/// was, the (truncated) raw model output, and how it was judged.
#[derive(Serialize)]
pub struct IterateAttempt {
    pub iteration: u32,
    // Truncated to 2000 chars by the caller before insertion.
    pub raw: String,
    pub status: AttemptStatus,
}

/// Terminal judgement of a single attempt. Serialized as a tagged
/// union: `{"kind": "no_json" | "validation_failed" | "accepted", ...}`.
#[derive(Serialize)]
#[serde(tag = "kind", rename_all = "snake_case")]
pub enum AttemptStatus {
    /// Model output didn't contain extractable JSON.
    NoJson,
    /// JSON extracted but failed validation; carries the error.
    ValidationFailed { error: serde_json::Value },
    /// Validation passed (last attempt's terminal status).
    Accepted,
}
|
||||
|
||||
/// 200 body: the accepted artifact plus its validation report and
/// the full attempt history that led to acceptance.
#[derive(Serialize)]
pub struct IterateResponse {
    pub artifact: serde_json::Value,
    pub validation: serde_json::Value,
    pub iterations: u32,
    pub history: Vec<IterateAttempt>,
    /// Echoes the resolved trace id (caller-forwarded header, body
    /// field, langfuse-middleware mint, or local fallback). Operators
    /// pivot from this id straight into Langfuse + the
    /// coordinator_sessions.jsonl join key. Cross-runtime parity with
    /// Go's `validator.IterateResponse` (commit 6847bbc in
    /// golangLAKEHOUSE).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub trace_id: Option<String>,
}

/// 422 body: iteration budget exhausted without a passing artifact;
/// carries the same history/trace id so callers can diagnose.
#[derive(Serialize)]
pub struct IterateFailure {
    pub error: String,
    pub iterations: u32,
    pub history: Vec<IterateAttempt>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub trace_id: Option<String>,
}
|
||||
|
||||
pub async fn iterate(
|
||||
State(state): State<super::V1State>,
|
||||
headers: HeaderMap,
|
||||
Json(req): Json<IterateRequest>,
|
||||
) -> impl IntoResponse {
|
||||
let max_iter = req.max_iterations.unwrap_or(DEFAULT_MAX_ITERATIONS).max(1);
|
||||
let temperature = req.temperature.unwrap_or(0.2);
|
||||
let max_tokens = req.max_tokens.unwrap_or(4096);
|
||||
let mut history: Vec<IterateAttempt> = Vec::with_capacity(max_iter as usize);
|
||||
let mut attempt_records: Vec<super::session_log::SessionAttemptRecord> = Vec::with_capacity(max_iter as usize);
|
||||
let mut current_prompt = req.prompt.clone();
|
||||
|
||||
// Resolve the parent Langfuse trace id. Caller-forwarded header
|
||||
// wins (cross-daemon tree linkage); otherwise mint a fresh id so
|
||||
// the iterate session is its own tree. Same shape as the Go-side
|
||||
// validatord trace propagation.
|
||||
let trace_id: String = headers
|
||||
.get(TRACE_ID_HEADER)
|
||||
.and_then(|v| v.to_str().ok())
|
||||
.filter(|s| !s.is_empty())
|
||||
.map(|s| s.to_string())
|
||||
.unwrap_or_else(new_trace_id);
|
||||
|
||||
let client = match reqwest::Client::builder()
|
||||
.timeout(std::time::Duration::from_secs(LOOPBACK_TIMEOUT_SECS))
|
||||
.build() {
|
||||
Ok(c) => c,
|
||||
Err(e) => {
|
||||
// Even infrastructure failures get a session row so a
|
||||
// missing /v1/iterate event never silently disappears
|
||||
// from the longitudinal log.
|
||||
write_infra_error(&state, &req, &trace_id, max_iter, 0, format!("client build: {e}")).await;
|
||||
return (StatusCode::INTERNAL_SERVER_ERROR, format!("client build: {e}")).into_response();
|
||||
}
|
||||
};
|
||||
// Self-loopback to the gateway port. Carries gateway internal
|
||||
// calls through /v1/chat + /v1/validate so /v1/usage tracks them.
|
||||
let gateway = "http://127.0.0.1:3100";
|
||||
let t0 = std::time::Instant::now();
|
||||
|
||||
for iteration in 0..max_iter {
|
||||
let attempt_started = chrono::Utc::now();
|
||||
// ── Generate ──
|
||||
let mut messages = Vec::with_capacity(2);
|
||||
if let Some(sys) = &req.system {
|
||||
messages.push(serde_json::json!({"role": "system", "content": sys}));
|
||||
}
|
||||
messages.push(serde_json::json!({"role": "user", "content": current_prompt}));
|
||||
let chat_body = serde_json::json!({
|
||||
"messages": messages,
|
||||
"provider": req.provider,
|
||||
"model": req.model,
|
||||
"temperature": temperature,
|
||||
"max_tokens": max_tokens,
|
||||
});
|
||||
let raw = match call_chat(&client, gateway, &chat_body, &trace_id).await {
|
||||
Ok(r) => r,
|
||||
Err(e) => {
|
||||
write_infra_error(&state, &req, &trace_id, max_iter, t0.elapsed().as_millis() as u64, format!("/v1/chat hop failed at iter {iteration}: {e}")).await;
|
||||
return (StatusCode::BAD_GATEWAY, format!("/v1/chat hop failed at iter {iteration}: {e}")).into_response();
|
||||
}
|
||||
};
|
||||
|
||||
// ── Extract JSON ──
|
||||
let artifact = match extract_json(&raw) {
|
||||
Some(a) => a,
|
||||
None => {
|
||||
let span_id = emit_attempt_span(
|
||||
&state, &trace_id, iteration, &req, ¤t_prompt, &raw, "no_json", None,
|
||||
attempt_started, chrono::Utc::now(),
|
||||
);
|
||||
history.push(IterateAttempt {
|
||||
iteration,
|
||||
raw: raw.chars().take(2000).collect(),
|
||||
status: AttemptStatus::NoJson,
|
||||
});
|
||||
attempt_records.push(super::session_log::SessionAttemptRecord {
|
||||
iteration,
|
||||
verdict_kind: "no_json".to_string(),
|
||||
error: None,
|
||||
span_id,
|
||||
});
|
||||
current_prompt = format!(
|
||||
"{}\n\nYour previous attempt did not contain a JSON object. Reply with ONLY a valid JSON object matching the requested artifact shape.",
|
||||
req.prompt,
|
||||
);
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
// ── Validate ──
|
||||
let validate_body = serde_json::json!({
|
||||
"kind": req.kind,
|
||||
"artifact": artifact,
|
||||
"context": req.context.clone().unwrap_or(serde_json::Value::Null),
|
||||
});
|
||||
match call_validate(&client, gateway, &validate_body, &trace_id).await {
|
||||
Ok(report) => {
|
||||
let span_id = emit_attempt_span(
|
||||
&state, &trace_id, iteration, &req, ¤t_prompt, &raw, "accepted", None,
|
||||
attempt_started, chrono::Utc::now(),
|
||||
);
|
||||
history.push(IterateAttempt {
|
||||
iteration,
|
||||
raw: raw.chars().take(2000).collect(),
|
||||
status: AttemptStatus::Accepted,
|
||||
});
|
||||
attempt_records.push(super::session_log::SessionAttemptRecord {
|
||||
iteration,
|
||||
verdict_kind: "accepted".to_string(),
|
||||
error: None,
|
||||
span_id,
|
||||
});
|
||||
let duration_ms = t0.elapsed().as_millis() as u64;
|
||||
let grounded = grounded_in_roster(&state, &req.kind, &artifact);
|
||||
write_session_accepted(&state, &req, &trace_id, iteration + 1, max_iter, attempt_records, &artifact, grounded, duration_ms).await;
|
||||
return (StatusCode::OK, Json(IterateResponse {
|
||||
artifact,
|
||||
validation: report,
|
||||
iterations: iteration + 1,
|
||||
history,
|
||||
trace_id: Some(trace_id.clone()),
|
||||
})).into_response();
|
||||
}
|
||||
Err(err) => {
|
||||
let err_summary = err.to_string();
|
||||
let span_id = emit_attempt_span(
|
||||
&state, &trace_id, iteration, &req, ¤t_prompt, &raw, "validation_failed",
|
||||
Some(err_summary.clone()),
|
||||
attempt_started, chrono::Utc::now(),
|
||||
);
|
||||
history.push(IterateAttempt {
|
||||
iteration,
|
||||
raw: raw.chars().take(2000).collect(),
|
||||
status: AttemptStatus::ValidationFailed {
|
||||
error: serde_json::to_value(&err_summary).unwrap_or(serde_json::Value::Null),
|
||||
},
|
||||
});
|
||||
attempt_records.push(super::session_log::SessionAttemptRecord {
|
||||
iteration,
|
||||
verdict_kind: "validation_failed".to_string(),
|
||||
error: Some(err_summary.clone()),
|
||||
span_id,
|
||||
});
|
||||
// Append validation feedback to prompt for next iter.
|
||||
// The model sees concrete failure mode + retries with
|
||||
// corrective context. This is the "observer correction"
|
||||
// in Phase 43 PRD shape, simplified — the validator
|
||||
// itself IS the observer for now.
|
||||
current_prompt = format!(
|
||||
"{}\n\nPrior attempt failed validation:\n{}\n\nFix the specific issue above and respond with a corrected JSON object.",
|
||||
req.prompt, err_summary,
|
||||
);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let duration_ms = t0.elapsed().as_millis() as u64;
|
||||
write_session_failure(&state, &req, &trace_id, max_iter, max_iter, attempt_records, duration_ms).await;
|
||||
(StatusCode::UNPROCESSABLE_ENTITY, Json(IterateFailure {
|
||||
error: format!("max iterations reached ({max_iter}) without passing validation"),
|
||||
iterations: max_iter,
|
||||
history,
|
||||
trace_id: Some(trace_id.clone()),
|
||||
})).into_response()
|
||||
}
|
||||
|
||||
// ─── Helpers — Langfuse spans + session log + roster check ─────────
|
||||
|
||||
fn emit_attempt_span(
|
||||
state: &super::V1State,
|
||||
trace_id: &str,
|
||||
iteration: u32,
|
||||
req: &IterateRequest,
|
||||
prompt: &str,
|
||||
raw: &str,
|
||||
verdict: &str,
|
||||
error: Option<String>,
|
||||
started: chrono::DateTime<chrono::Utc>,
|
||||
ended: chrono::DateTime<chrono::Utc>,
|
||||
) -> Option<String> {
|
||||
let lf = state.langfuse.as_ref()?;
|
||||
Some(lf.emit_attempt_span(super::langfuse_trace::AttemptSpan {
|
||||
trace_id: trace_id.to_string(),
|
||||
iteration,
|
||||
model: req.model.clone(),
|
||||
provider: req.provider.clone(),
|
||||
prompt: prompt.to_string(),
|
||||
raw: raw.to_string(),
|
||||
verdict: verdict.to_string(),
|
||||
error,
|
||||
start_time: started.to_rfc3339(),
|
||||
end_time: ended.to_rfc3339(),
|
||||
}))
|
||||
}
|
||||
|
||||
/// Verify every fill artifact's candidate IDs exist in the roster.
|
||||
/// Returns Some(true)/Some(false) on the fill kind, None otherwise
|
||||
/// (other kinds don't have worker IDs to ground). Same semantics as
|
||||
/// Go's `handlers.rosterCheckFor("fill")`.
|
||||
fn grounded_in_roster(
|
||||
state: &super::V1State,
|
||||
kind: &str,
|
||||
artifact: &serde_json::Value,
|
||||
) -> Option<bool> {
|
||||
if kind != "fill" {
|
||||
return None;
|
||||
}
|
||||
let fills = artifact.get("fills").and_then(|v| v.as_array())?;
|
||||
for f in fills {
|
||||
let id = match f.get("candidate_id").and_then(|v| v.as_str()) {
|
||||
Some(s) if !s.is_empty() => s,
|
||||
_ => return Some(false),
|
||||
};
|
||||
if state.validate_workers.find(id).is_none() {
|
||||
return Some(false);
|
||||
}
|
||||
}
|
||||
Some(true)
|
||||
}
|
||||
|
||||
async fn write_session_accepted(
|
||||
state: &super::V1State,
|
||||
req: &IterateRequest,
|
||||
trace_id: &str,
|
||||
iterations: u32,
|
||||
max_iter: u32,
|
||||
attempts: Vec<super::session_log::SessionAttemptRecord>,
|
||||
artifact: &serde_json::Value,
|
||||
grounded: Option<bool>,
|
||||
duration_ms: u64,
|
||||
) {
|
||||
let Some(logger) = state.session_log.as_ref() else { return };
|
||||
let rec = build_session_record(req, trace_id, "accepted", iterations, max_iter, attempts, Some(artifact.clone()), grounded, duration_ms);
|
||||
logger.append(rec).await;
|
||||
}
|
||||
|
||||
async fn write_session_failure(
|
||||
state: &super::V1State,
|
||||
req: &IterateRequest,
|
||||
trace_id: &str,
|
||||
iterations: u32,
|
||||
max_iter: u32,
|
||||
attempts: Vec<super::session_log::SessionAttemptRecord>,
|
||||
duration_ms: u64,
|
||||
) {
|
||||
let Some(logger) = state.session_log.as_ref() else { return };
|
||||
let rec = build_session_record(req, trace_id, "max_iter_exhausted", iterations, max_iter, attempts, None, None, duration_ms);
|
||||
logger.append(rec).await;
|
||||
}
|
||||
|
||||
async fn write_infra_error(
|
||||
state: &super::V1State,
|
||||
req: &IterateRequest,
|
||||
trace_id: &str,
|
||||
max_iter: u32,
|
||||
duration_ms: u64,
|
||||
error: String,
|
||||
) {
|
||||
let Some(logger) = state.session_log.as_ref() else { return };
|
||||
let attempts = vec![super::session_log::SessionAttemptRecord {
|
||||
iteration: 0,
|
||||
verdict_kind: "infra_error".to_string(),
|
||||
error: Some(error),
|
||||
span_id: None,
|
||||
}];
|
||||
let rec = build_session_record(req, trace_id, "infra_error", 0, max_iter, attempts, None, None, duration_ms);
|
||||
logger.append(rec).await;
|
||||
}
|
||||
|
||||
fn build_session_record(
|
||||
req: &IterateRequest,
|
||||
trace_id: &str,
|
||||
final_verdict: &str,
|
||||
iterations: u32,
|
||||
max_iter: u32,
|
||||
attempts: Vec<super::session_log::SessionAttemptRecord>,
|
||||
artifact: Option<serde_json::Value>,
|
||||
grounded: Option<bool>,
|
||||
duration_ms: u64,
|
||||
) -> super::session_log::SessionRecord {
|
||||
super::session_log::SessionRecord {
|
||||
schema: super::session_log::SESSION_RECORD_SCHEMA.to_string(),
|
||||
session_id: trace_id.to_string(),
|
||||
timestamp: chrono::Utc::now().to_rfc3339(),
|
||||
daemon: "gateway".to_string(),
|
||||
kind: req.kind.clone(),
|
||||
model: req.model.clone(),
|
||||
provider: req.provider.clone(),
|
||||
prompt: super::session_log::truncate(&req.prompt, 4000),
|
||||
iterations,
|
||||
max_iterations: max_iter,
|
||||
final_verdict: final_verdict.to_string(),
|
||||
attempts,
|
||||
artifact,
|
||||
grounded_in_roster: grounded,
|
||||
duration_ms,
|
||||
}
|
||||
}
|
||||
|
||||
/// Generate a fresh trace id when no parent was forwarded. Same
|
||||
/// time-ordered hex shape Langfuse already accepts elsewhere in this
|
||||
/// crate (see `langfuse_trace::uuid_v7_like`).
|
||||
fn new_trace_id() -> String {
|
||||
let ts = chrono::Utc::now().timestamp_nanos_opt().unwrap_or(0);
|
||||
let rand = std::time::SystemTime::now()
|
||||
.duration_since(std::time::UNIX_EPOCH)
|
||||
.map(|d| d.subsec_nanos())
|
||||
.unwrap_or(0);
|
||||
format!("{:016x}-{:08x}", ts, rand)
|
||||
}
|
||||
|
||||
async fn call_chat(client: &reqwest::Client, gateway: &str, body: &serde_json::Value, trace_id: &str) -> Result<String, String> {
|
||||
let mut req = client.post(format!("{gateway}/v1/chat")).json(body);
|
||||
if !trace_id.is_empty() {
|
||||
req = req.header(TRACE_ID_HEADER, trace_id);
|
||||
}
|
||||
let resp = req.send().await.map_err(|e| format!("chat hop: {e}"))?;
|
||||
let status = resp.status();
|
||||
if !status.is_success() {
|
||||
let body = resp.text().await.unwrap_or_default();
|
||||
return Err(format!("chat {}: {}", status, body.chars().take(300).collect::<String>()));
|
||||
}
|
||||
let parsed: serde_json::Value = resp.json().await.map_err(|e| format!("chat parse: {e}"))?;
|
||||
Ok(parsed.pointer("/choices/0/message/content")
|
||||
.and_then(|v| v.as_str())
|
||||
.unwrap_or("")
|
||||
.to_string())
|
||||
}
|
||||
|
||||
async fn call_validate(client: &reqwest::Client, gateway: &str, body: &serde_json::Value, trace_id: &str) -> Result<serde_json::Value, String> {
|
||||
let mut req = client.post(format!("{gateway}/v1/validate")).json(body);
|
||||
if !trace_id.is_empty() {
|
||||
req = req.header(TRACE_ID_HEADER, trace_id);
|
||||
}
|
||||
let resp = req.send().await.map_err(|e| format!("validate hop: {e}"))?;
|
||||
let status = resp.status();
|
||||
let parsed: serde_json::Value = resp.json().await.map_err(|e| format!("validate parse: {e}"))?;
|
||||
if status.is_success() {
|
||||
Ok(parsed)
|
||||
} else {
|
||||
// The /v1/validate endpoint returns a ValidationError JSON
|
||||
// on 422; surface its structure verbatim so the prompt-
|
||||
// appending step gets specific failure detail.
|
||||
Err(serde_json::to_string(&parsed).unwrap_or_else(|_| format!("validation {} (unparseable body)", status)))
|
||||
}
|
||||
}
|
||||
|
||||
/// Extract the first JSON object from a model's output. Handles
|
||||
/// fenced code blocks (```json ... ```), bare braces, and stray
|
||||
/// prose around the JSON. Returns None on no extractable object.
|
||||
///
|
||||
/// Made `pub` 2026-05-02 to support the cross-runtime parity probe
|
||||
/// at `golangLAKEHOUSE/scripts/cutover/parity/extract_json_parity.sh`.
|
||||
/// The Go counterpart lives at `internal/validator/iterate.go::ExtractJSON`;
|
||||
/// when either runtime's algorithm changes the parity probe surfaces
|
||||
/// the divergence.
|
||||
pub fn extract_json(raw: &str) -> Option<serde_json::Value> {
|
||||
// Try fenced first.
|
||||
let candidates: Vec<String> = {
|
||||
let mut out = vec![];
|
||||
let mut s = raw;
|
||||
while let Some(start) = s.find("```") {
|
||||
let after = &s[start + 3..];
|
||||
// Skip optional language tag (json, etc.)
|
||||
let body_start = after.find('\n').map(|n| n + 1).unwrap_or(0);
|
||||
let body = &after[body_start..];
|
||||
if let Some(end) = body.find("```") {
|
||||
out.push(body[..end].trim().to_string());
|
||||
s = &body[end + 3..];
|
||||
} else { break; }
|
||||
}
|
||||
out
|
||||
};
|
||||
for c in &candidates {
|
||||
if let Ok(v) = serde_json::from_str::<serde_json::Value>(c) {
|
||||
if v.is_object() { return Some(v); }
|
||||
}
|
||||
}
|
||||
// Fall back to outermost {...} balance.
|
||||
let bytes = raw.as_bytes();
|
||||
let mut depth = 0i32;
|
||||
let mut start: Option<usize> = None;
|
||||
for (i, &b) in bytes.iter().enumerate() {
|
||||
match b {
|
||||
b'{' => { if start.is_none() { start = Some(i); } depth += 1; }
|
||||
b'}' => {
|
||||
depth -= 1;
|
||||
if depth == 0 {
|
||||
if let Some(s) = start {
|
||||
let slice = &raw[s..=i];
|
||||
if let Ok(v) = serde_json::from_str::<serde_json::Value>(slice) {
|
||||
if v.is_object() { return Some(v); }
|
||||
}
|
||||
start = None;
|
||||
}
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand: extraction must succeed for these fixtures.
    fn must_extract(raw: &str) -> serde_json::Value {
        extract_json(raw).expect("expected a JSON object")
    }

    #[test]
    fn extract_json_from_fenced_block() {
        let raw = "Here's my answer:\n```json\n{\"fills\": [{\"candidate_id\": \"W-1\"}]}\n```\nDone.";
        assert!(must_extract(raw).get("fills").is_some());
    }

    #[test]
    fn extract_json_from_bare_braces() {
        let raw = "Here you go: {\"fills\": [{\"candidate_id\": \"W-2\"}]}";
        assert!(must_extract(raw).get("fills").is_some());
    }

    #[test]
    fn extract_json_returns_none_on_no_object() {
        assert!(extract_json("just prose, no json").is_none());
    }

    #[test]
    fn extract_json_picks_first_balanced() {
        let v = must_extract("{\"a\":1} then {\"b\":2}");
        assert_eq!(v.get("a").and_then(|v| v.as_i64()), Some(1));
    }
}
|
||||
@ -1,227 +0,0 @@
|
||||
//! Kimi For Coding adapter — direct provider for `kimi-for-coding`
|
||||
//! (kimi-k2.6 underneath). Used when Ollama Cloud's `kimi-k2:1t` is
|
||||
//! returning sustained 5xx (broken upstream) and OpenRouter's
|
||||
//! `moonshotai/kimi-k2.6` is rate-limited.
|
||||
//!
|
||||
//! Endpoint per `kimi.com/code/docs` and `moonshotai.github.io/kimi-cli`:
|
||||
//! base_url: https://api.kimi.com/coding/v1
|
||||
//! model id: kimi-for-coding
|
||||
//! auth: Bearer sk-kimi-…
|
||||
//! protocol: OpenAI Chat Completions compatible
|
||||
//!
|
||||
//! IMPORTANT: `api.kimi.com` is a separate account system from
|
||||
//! `api.moonshot.ai` and `api.moonshot.cn`. Keys are NOT interchangeable.
|
||||
//! This adapter is for `sk-kimi-*` keys provisioned via the Kimi
|
||||
//! membership console only.
|
||||
//!
|
||||
//! Key sourcing priority:
|
||||
//! 1. Env var `KIMI_API_KEY` (loaded from /etc/lakehouse/kimi.env via
|
||||
//! systemd EnvironmentFile=)
|
||||
//! 2. /etc/lakehouse/kimi.env directly (rescue path if env not loaded)
|
||||
//!
|
||||
//! First hit wins. Resolved once at gateway startup, stored on
|
||||
//! `V1State.kimi_key`.
|
||||
|
||||
use std::time::Duration;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use super::{ChatRequest, ChatResponse, Choice, Message, UsageBlock};
|
||||
|
||||
// Kimi coding endpoint — a separate account system from
// api.moonshot.ai / api.moonshot.cn; keys are NOT interchangeable.
const KIMI_BASE_URL: &str = "https://api.kimi.com/coding/v1";
// Default 600s — kimi-for-coding is a reasoning model; on large
// code-audit prompts (~50KB+ input + 8K output) it routinely needs
// 3-8 min to think + emit. Override with KIMI_TIMEOUT_SECS env var.
const KIMI_TIMEOUT_SECS_DEFAULT: u64 = 600;
|
||||
|
||||
fn kimi_timeout_secs() -> u64 {
|
||||
std::env::var("KIMI_TIMEOUT_SECS")
|
||||
.ok()
|
||||
.and_then(|s| s.trim().parse::<u64>().ok())
|
||||
.filter(|&n| n > 0)
|
||||
.unwrap_or(KIMI_TIMEOUT_SECS_DEFAULT)
|
||||
}
|
||||
|
||||
/// Resolve the `sk-kimi-*` API key. Priority: env var `KIMI_API_KEY`
/// (normally injected from /etc/lakehouse/kimi.env via systemd
/// EnvironmentFile=), then reading that file directly as a rescue
/// path. First non-empty hit wins; returns None when neither yields.
pub fn resolve_kimi_key() -> Option<String> {
    // 1. Environment.
    if let Ok(k) = std::env::var("KIMI_API_KEY") {
        let trimmed = k.trim();
        if !trimmed.is_empty() {
            return Some(trimmed.to_string());
        }
    }
    // 2. Env file on disk (systemd may not have loaded it). Values
    //    may be quoted in the file; strip one layer of quotes.
    let raw = std::fs::read_to_string("/etc/lakehouse/kimi.env").ok()?;
    raw.lines()
        .filter_map(|line| line.strip_prefix("KIMI_API_KEY="))
        .map(|rest| rest.trim().trim_matches('"').trim_matches('\''))
        .find(|k| !k.is_empty())
        .map(str::to_string)
}
|
||||
|
||||
pub async fn chat(
|
||||
key: &str,
|
||||
req: &ChatRequest,
|
||||
) -> Result<ChatResponse, String> {
|
||||
// Strip the "kimi/" namespace prefix if the caller used it so the
|
||||
// upstream API sees the bare model id (e.g. "kimi-for-coding").
|
||||
let model = req.model.strip_prefix("kimi/").unwrap_or(&req.model).to_string();
|
||||
|
||||
// Flatten content to a plain String. api.kimi.com is text-only on
|
||||
// the coding endpoint; the OpenAI multimodal array shape
|
||||
// ([{type:"text",text:"..."},{type:"image_url",...}]) returns 400.
|
||||
// Message::text() concats text-parts and drops non-text. Caught
|
||||
// 2026-04-27 by Kimi's self-audit (kimi.rs:137 — content as raw
|
||||
// serde_json::Value risked upstream rejection).
|
||||
let body = KimiChatBody {
|
||||
model: model.clone(),
|
||||
messages: req.messages.iter().map(|m| KimiMessage {
|
||||
role: m.role.clone(),
|
||||
content: serde_json::Value::String(m.text()),
|
||||
}).collect(),
|
||||
max_tokens: req.max_tokens.unwrap_or(800),
|
||||
temperature: req.temperature.unwrap_or(0.3),
|
||||
stream: false,
|
||||
};
|
||||
|
||||
let client = reqwest::Client::builder()
|
||||
.timeout(Duration::from_secs(kimi_timeout_secs()))
|
||||
.build()
|
||||
.map_err(|e| format!("build client: {e}"))?;
|
||||
|
||||
let t0 = std::time::Instant::now();
|
||||
let resp = client
|
||||
.post(format!("{}/chat/completions", KIMI_BASE_URL))
|
||||
.bearer_auth(key)
|
||||
// api.kimi.com gates this endpoint by User-Agent — only sanctioned
|
||||
// coding agents (Claude Code, Kimi CLI, Roo Code, Kilo Code) get
|
||||
// through. Generic clients receive 403 access_terminated_error.
|
||||
// J accepted the TOS risk on 2026-04-27; revisit if Moonshot
|
||||
// tightens enforcement.
|
||||
.header("User-Agent", "claude-code/1.0.0")
|
||||
.json(&body)
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| format!("api.kimi.com unreachable: {e}"))?;
|
||||
|
||||
let status = resp.status();
|
||||
if !status.is_success() {
|
||||
let body = resp.text().await.unwrap_or_else(|_| "?".into());
|
||||
return Err(format!("api.kimi.com {}: {}", status, body));
|
||||
}
|
||||
|
||||
let parsed: KimiChatResponse = resp.json().await
|
||||
.map_err(|e| format!("invalid kimi response: {e}"))?;
|
||||
|
||||
let latency_ms = t0.elapsed().as_millis();
|
||||
let choice = parsed.choices.into_iter().next()
|
||||
.ok_or_else(|| "kimi returned no choices".to_string())?;
|
||||
let text = choice.message.content;
|
||||
|
||||
let prompt_tokens = parsed.usage.as_ref().map(|u| u.prompt_tokens).unwrap_or_else(|| {
|
||||
let chars: usize = req.messages.iter().map(|m| m.text().chars().count()).sum();
|
||||
((chars + 3) / 4) as u32
|
||||
});
|
||||
let completion_tokens = parsed.usage.as_ref().map(|u| u.completion_tokens).unwrap_or_else(|| {
|
||||
((text.chars().count() + 3) / 4) as u32
|
||||
});
|
||||
|
||||
tracing::info!(
|
||||
target: "v1.chat",
|
||||
provider = "kimi",
|
||||
model = %model,
|
||||
prompt_tokens,
|
||||
completion_tokens,
|
||||
latency_ms = latency_ms as u64,
|
||||
"kimi chat completed",
|
||||
);
|
||||
|
||||
Ok(ChatResponse {
|
||||
id: format!("chatcmpl-{}", chrono::Utc::now().timestamp_nanos_opt().unwrap_or(0)),
|
||||
object: "chat.completion",
|
||||
created: chrono::Utc::now().timestamp(),
|
||||
model,
|
||||
choices: vec![Choice {
|
||||
index: 0,
|
||||
message: Message { role: "assistant".into(), content: serde_json::Value::String(text) },
|
||||
finish_reason: choice.finish_reason.unwrap_or_else(|| "stop".into()),
|
||||
}],
|
||||
usage: UsageBlock {
|
||||
prompt_tokens,
|
||||
completion_tokens,
|
||||
total_tokens: prompt_tokens + completion_tokens,
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
// -- Kimi wire shapes (OpenAI-compatible) --
|
||||
|
||||
/// Outbound request body — OpenAI Chat Completions compatible.
#[derive(Serialize)]
struct KimiChatBody {
    model: String,
    messages: Vec<KimiMessage>,
    max_tokens: u32,
    temperature: f64,
    // Always false — this adapter does not stream.
    stream: bool,
}

/// Outbound message. `content` is a serde_json::Value but is always
/// populated with Value::String by `chat` (the endpoint is text-only).
#[derive(Serialize)]
struct KimiMessage { role: String, content: serde_json::Value }

/// Inbound response envelope. `usage` is optional — the adapter
/// estimates token counts when the API omits it.
#[derive(Deserialize)]
struct KimiChatResponse {
    choices: Vec<KimiChoice>,
    #[serde(default)]
    usage: Option<KimiUsage>,
}

/// One completion choice; `finish_reason` defaults to None and is
/// mapped to "stop" by the caller.
#[derive(Deserialize)]
struct KimiChoice {
    message: KimiMessageResp,
    #[serde(default)]
    finish_reason: Option<String>,
}

/// Inbound message payload — plain text on this endpoint.
#[derive(Deserialize)]
struct KimiMessageResp { content: String }

/// Token accounting block (OpenAI-style snake_case keys).
#[derive(Deserialize)]
struct KimiUsage { prompt_tokens: u32, completion_tokens: u32 }
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Key resolution reads env + disk; it must tolerate any host
    /// environment without panicking.
    #[test]
    fn resolve_kimi_key_does_not_panic() {
        let _ = resolve_kimi_key();
    }

    /// The request body must serialize with the OpenAI-compatible
    /// snake_case keys api.kimi.com expects.
    #[test]
    fn chat_body_serializes_to_openai_shape() {
        let body = KimiChatBody {
            model: "kimi-for-coding".into(),
            messages: vec![KimiMessage {
                role: "user".into(),
                content: "review this".into(),
            }],
            max_tokens: 800,
            temperature: 0.3,
            stream: false,
        };
        let json = serde_json::to_string(&body).unwrap();
        for needle in [
            "\"model\":\"kimi-for-coding\"",
            "\"messages\"",
            "\"max_tokens\":800",
            "\"stream\":false",
        ] {
            assert!(json.contains(needle), "missing {needle} in {json}");
        }
    }

    /// `kimi/`-prefixed and bare model ids both normalize to the bare
    /// name the upstream API expects.
    #[test]
    fn model_prefix_strip() {
        for (input, expected) in [
            ("kimi/kimi-for-coding", "kimi-for-coding"),
            ("kimi-for-coding", "kimi-for-coding"),
            ("kimi/kimi-k2.6", "kimi-k2.6"),
        ] {
            assert_eq!(
                input.strip_prefix("kimi/").unwrap_or(input),
                expected,
                "{input} should become {expected}"
            );
        }
    }
}
|
||||
@ -76,54 +76,63 @@ impl LangfuseClient {
|
||||
});
|
||||
}
|
||||
|
||||
/// Fire-and-forget per-iteration span emit. Returns the generated
|
||||
/// span id synchronously so the caller can stamp it on
|
||||
/// `IterateAttempt.span_id` before the network round-trip resolves.
|
||||
/// Mirrors Go's `validator.Tracer` callback shape.
|
||||
pub fn emit_attempt_span(&self, sp: AttemptSpan) -> String {
|
||||
let span_id = uuid_v7_like();
|
||||
let span_id_for_caller = span_id.clone();
|
||||
let this = self.clone();
|
||||
tokio::spawn(async move {
|
||||
if let Err(e) = this.emit_attempt_span_inner(span_id, sp).await {
|
||||
tracing::warn!(target: "v1.langfuse", "iterate span drop: {e}");
|
||||
}
|
||||
});
|
||||
span_id_for_caller
|
||||
}
|
||||
async fn emit_chat_inner(&self, ev: ChatTrace) -> Result<(), String> {
|
||||
let trace_id = uuid_v7_like();
|
||||
let gen_id = uuid_v7_like();
|
||||
let trace_ts = ev.start_time.clone();
|
||||
|
||||
async fn emit_attempt_span_inner(&self, span_id: String, sp: AttemptSpan) -> Result<(), String> {
|
||||
let level = if sp.verdict == "accepted" { "DEFAULT" } else { "WARNING" };
|
||||
let batch = IngestionBatch {
|
||||
batch: vec![IngestionEvent {
|
||||
id: uuid_v7_like(),
|
||||
timestamp: sp.end_time.clone(),
|
||||
kind: "span-create",
|
||||
body: serde_json::json!({
|
||||
"id": span_id,
|
||||
"traceId": sp.trace_id,
|
||||
"name": format!("iterate.attempt[{}]", sp.iteration),
|
||||
"input": serde_json::json!({
|
||||
"iteration": sp.iteration,
|
||||
"model": sp.model,
|
||||
"provider": sp.provider,
|
||||
"prompt": truncate(&sp.prompt, 4000),
|
||||
batch: vec![
|
||||
IngestionEvent {
|
||||
id: uuid_v7_like(),
|
||||
timestamp: trace_ts.clone(),
|
||||
kind: "trace-create",
|
||||
body: serde_json::json!({
|
||||
"id": trace_id,
|
||||
"name": format!("v1.chat:{}", ev.provider),
|
||||
"input": serde_json::json!({
|
||||
"model": ev.model,
|
||||
"messages": ev.input,
|
||||
}),
|
||||
"metadata": serde_json::json!({
|
||||
"provider": ev.provider,
|
||||
"think": ev.think,
|
||||
}),
|
||||
}),
|
||||
"output": serde_json::json!({
|
||||
"verdict": sp.verdict,
|
||||
"error": sp.error,
|
||||
"raw": truncate(&sp.raw, 4000),
|
||||
},
|
||||
IngestionEvent {
|
||||
id: uuid_v7_like(),
|
||||
timestamp: ev.end_time.clone(),
|
||||
kind: "generation-create",
|
||||
body: serde_json::json!({
|
||||
"id": gen_id,
|
||||
"traceId": trace_id,
|
||||
"name": "chat",
|
||||
"model": ev.model,
|
||||
"modelParameters": serde_json::json!({
|
||||
"temperature": ev.temperature,
|
||||
"max_tokens": ev.max_tokens,
|
||||
"think": ev.think,
|
||||
}),
|
||||
"input": ev.input,
|
||||
"output": ev.output,
|
||||
"usage": serde_json::json!({
|
||||
"input": ev.prompt_tokens,
|
||||
"output": ev.completion_tokens,
|
||||
"total": ev.prompt_tokens + ev.completion_tokens,
|
||||
"unit": "TOKENS",
|
||||
}),
|
||||
"startTime": ev.start_time,
|
||||
"endTime": ev.end_time,
|
||||
"metadata": serde_json::json!({
|
||||
"provider": ev.provider,
|
||||
"latency_ms": ev.latency_ms,
|
||||
}),
|
||||
}),
|
||||
"level": level,
|
||||
"startTime": sp.start_time,
|
||||
"endTime": sp.end_time,
|
||||
}),
|
||||
}],
|
||||
},
|
||||
],
|
||||
};
|
||||
self.post_batch(batch).await
|
||||
}
|
||||
|
||||
async fn post_batch(&self, batch: IngestionBatch) -> Result<(), String> {
|
||||
let url = format!("{}{}", self.inner.base_url.trim_end_matches('/'), INGESTION_PATH);
|
||||
let resp = self.inner.http
|
||||
.post(url)
|
||||
@ -137,81 +146,6 @@ impl LangfuseClient {
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn emit_chat_inner(&self, ev: ChatTrace) -> Result<(), String> {
|
||||
// When the caller forwarded a parent trace id (via the
|
||||
// X-Lakehouse-Trace-Id header → V1State plumbing), attach the
|
||||
// generation as a child of that trace. Without a parent we
|
||||
// mint a new top-level trace per call (Phase 40 default).
|
||||
let trace_id = ev.parent_trace_id.clone().unwrap_or_else(uuid_v7_like);
|
||||
let nested = ev.parent_trace_id.is_some();
|
||||
let gen_id = uuid_v7_like();
|
||||
let trace_ts = ev.start_time.clone();
|
||||
|
||||
let mut events = Vec::with_capacity(2);
|
||||
if !nested {
|
||||
// Only mint a fresh trace-create when we don't have a parent.
|
||||
// Reusing a parent trace id without re-creating it is the
|
||||
// contract that lets validatord's iterate-session show up
|
||||
// as one tree in Langfuse.
|
||||
events.push(IngestionEvent {
|
||||
id: uuid_v7_like(),
|
||||
timestamp: trace_ts.clone(),
|
||||
kind: "trace-create",
|
||||
body: serde_json::json!({
|
||||
"id": trace_id,
|
||||
"name": format!("v1.chat:{}", ev.provider),
|
||||
"input": serde_json::json!({
|
||||
"model": ev.model,
|
||||
"messages": ev.input,
|
||||
}),
|
||||
"metadata": serde_json::json!({
|
||||
"provider": ev.provider,
|
||||
"think": ev.think,
|
||||
}),
|
||||
}),
|
||||
});
|
||||
}
|
||||
events.push(IngestionEvent {
|
||||
id: uuid_v7_like(),
|
||||
timestamp: ev.end_time.clone(),
|
||||
kind: "generation-create",
|
||||
body: serde_json::json!({
|
||||
"id": gen_id,
|
||||
"traceId": trace_id,
|
||||
"name": "chat",
|
||||
"model": ev.model,
|
||||
"modelParameters": serde_json::json!({
|
||||
"temperature": ev.temperature,
|
||||
"max_tokens": ev.max_tokens,
|
||||
"think": ev.think,
|
||||
}),
|
||||
"input": ev.input,
|
||||
"output": ev.output,
|
||||
"usage": serde_json::json!({
|
||||
"input": ev.prompt_tokens,
|
||||
"output": ev.completion_tokens,
|
||||
"total": ev.prompt_tokens + ev.completion_tokens,
|
||||
"unit": "TOKENS",
|
||||
}),
|
||||
"startTime": ev.start_time,
|
||||
"endTime": ev.end_time,
|
||||
"metadata": serde_json::json!({
|
||||
"provider": ev.provider,
|
||||
"latency_ms": ev.latency_ms,
|
||||
}),
|
||||
}),
|
||||
});
|
||||
|
||||
self.post_batch(IngestionBatch { batch: events }).await
|
||||
}
|
||||
}
|
||||
|
||||
/// Truncate a string to at most `n` chars (NOT bytes). Matches the Go
|
||||
/// `trim` helper used in session log + attempt-span emission so an
|
||||
/// operator reading two cross-runtime traces sees the same boundary.
|
||||
fn truncate(s: &str, n: usize) -> String {
|
||||
s.chars().take(n).collect()
|
||||
}
|
||||
|
||||
/// Everything the v1.chat handler collects for one completed call.
|
||||
@ -228,32 +162,6 @@ pub struct ChatTrace {
|
||||
pub start_time: String,
|
||||
pub end_time: String,
|
||||
pub latency_ms: u64,
|
||||
/// When set, attach this chat trace as a child of the named
|
||||
/// Langfuse trace instead of starting a new top-level trace. Used
|
||||
/// by `/v1/iterate` to nest its inner /v1/chat hops under the
|
||||
/// iterate-session trace so a multi-call session shows in
|
||||
/// Langfuse as ONE trace tree, not N+1 disconnected traces.
|
||||
/// Matches the Go-side `X-Lakehouse-Trace-Id` propagation
|
||||
/// (commit d6d2fdf in golangLAKEHOUSE).
|
||||
pub parent_trace_id: Option<String>,
|
||||
}
|
||||
|
||||
/// One iteration attempt inside `/v1/iterate`'s loop. Becomes one
|
||||
/// span on the parent trace when emitted via `emit_attempt_span`.
|
||||
/// Matches Go's `validator.AttemptSpan` shape so the cross-runtime
|
||||
/// observability surface is consistent.
|
||||
pub struct AttemptSpan {
|
||||
pub trace_id: String,
|
||||
pub iteration: u32,
|
||||
pub model: String,
|
||||
pub provider: String,
|
||||
pub prompt: String,
|
||||
pub raw: String,
|
||||
/// Verdict kind: "no_json" | "validation_failed" | "accepted"
|
||||
pub verdict: String,
|
||||
pub error: Option<String>,
|
||||
pub start_time: String,
|
||||
pub end_time: String,
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
|
||||
@ -13,18 +13,7 @@
|
||||
|
||||
pub mod ollama;
|
||||
pub mod ollama_cloud;
|
||||
pub mod openrouter;
|
||||
pub mod gemini;
|
||||
pub mod claude;
|
||||
pub mod kimi;
|
||||
pub mod opencode;
|
||||
pub mod validate;
|
||||
pub mod iterate;
|
||||
pub mod langfuse_trace;
|
||||
pub mod session_log;
|
||||
pub mod mode;
|
||||
pub mod respond;
|
||||
pub mod truth;
|
||||
|
||||
use axum::{
|
||||
Router,
|
||||
@ -45,52 +34,10 @@ pub struct V1State {
|
||||
/// Ollama Cloud bearer token. Loaded at startup via
|
||||
/// `ollama_cloud::resolve_cloud_key()`. None = cloud routes 503.
|
||||
pub ollama_cloud_key: Option<String>,
|
||||
/// OpenRouter bearer token — free-tier rescue rung. Loaded at
|
||||
/// startup via `openrouter::resolve_openrouter_key()`. None means
|
||||
/// provider="openrouter" calls 503 rather than attempt. Same key
|
||||
/// sourcing as LLM Team UI so the two share one API quota.
|
||||
pub openrouter_key: Option<String>,
|
||||
/// Gemini API key (Google Generative Language). Loaded at startup
|
||||
/// via `gemini::resolve_gemini_key()`. None = provider="gemini"
|
||||
/// calls 503. Phase 40 deliverable.
|
||||
pub gemini_key: Option<String>,
|
||||
/// Anthropic Claude API key. Loaded at startup via
|
||||
/// `claude::resolve_claude_key()`. None = provider="claude" calls
|
||||
/// 503. Phase 40 deliverable.
|
||||
pub claude_key: Option<String>,
|
||||
/// Kimi For Coding (api.kimi.com) bearer token — direct provider
|
||||
/// for `kimi-for-coding`. Used when Ollama Cloud's `kimi-k2:1t` is
|
||||
/// upstream-broken. Loaded at startup via `kimi::resolve_kimi_key()`
|
||||
/// from `KIMI_API_KEY` env or `/etc/lakehouse/kimi.env`. None =
|
||||
/// provider="kimi" calls 503.
|
||||
pub kimi_key: Option<String>,
|
||||
/// OpenCode GO (opencode.ai) bearer token — multi-vendor curated
|
||||
/// gateway. One sk-* key reaches Claude Opus 4.7, GPT-5.5-pro,
|
||||
/// Gemini 3.1-pro, Kimi K2.6, DeepSeek, GLM, Qwen + free-tier.
|
||||
/// Loaded at startup via `opencode::resolve_opencode_key()` from
|
||||
/// `OPENCODE_API_KEY` env or `/etc/lakehouse/opencode.env`. None =
|
||||
/// provider="opencode" calls 503.
|
||||
pub opencode_key: Option<String>,
|
||||
/// Shared WorkerLookup loaded once at startup from
|
||||
/// workers_500k.parquet (path: LH_WORKERS_PARQUET env, default
|
||||
/// data/datasets/workers_500k.parquet). Used by /v1/validate to
|
||||
/// run FillValidator/EmailValidator with worker-existence checks.
|
||||
/// Falls back to an empty InMemoryWorkerLookup if the file is
|
||||
/// missing — validators still run schema/PII checks but every
|
||||
/// worker-existence check fails (Consistency error), which is
|
||||
/// the correct behavior when the roster isn't configured.
|
||||
pub validate_workers: std::sync::Arc<dyn validator::WorkerLookup>,
|
||||
/// Phase 40 early deliverable — Langfuse client. None = tracing
|
||||
/// disabled (keys missing or container unreachable). Traces are
|
||||
/// fire-and-forget: never block the response path.
|
||||
pub langfuse: Option<langfuse_trace::LangfuseClient>,
|
||||
/// Coordinator session JSONL writer (path from
|
||||
/// `[gateway].session_log_path`). One row per `/v1/iterate`
|
||||
/// session for offline DuckDB analysis. None = disabled.
|
||||
/// Cross-runtime parity with the Go-side `validatord`
|
||||
/// `[validatord].session_log_path` (commit 1a3a82a in
|
||||
/// golangLAKEHOUSE).
|
||||
pub session_log: Option<session_log::SessionLogger>,
|
||||
}
|
||||
|
||||
#[derive(Default, Clone, Serialize)]
|
||||
@ -99,88 +46,25 @@ pub struct Usage {
|
||||
pub prompt_tokens: u64,
|
||||
pub completion_tokens: u64,
|
||||
pub total_tokens: u64,
|
||||
#[serde(default)]
|
||||
pub by_provider: std::collections::HashMap<String, ProviderUsage>,
|
||||
}
|
||||
|
||||
#[derive(Default, Clone, Serialize)]
|
||||
pub struct ProviderUsage {
|
||||
pub requests: u64,
|
||||
pub prompt_tokens: u64,
|
||||
pub completion_tokens: u64,
|
||||
pub total_tokens: u64,
|
||||
}
|
||||
|
||||
pub fn router(state: V1State) -> Router {
|
||||
Router::new()
|
||||
.route("/chat", post(chat))
|
||||
// Canonical OpenAI path alias — lets any client built on the
|
||||
// openai SDK (pi-ai, langchain-js, etc.) treat the gateway as
|
||||
// a drop-in middleware via OPENAI_BASE_URL=http://gw/v1 alone.
|
||||
// Same handler as /chat; same OpenAI-compatible request shape.
|
||||
.route("/chat/completions", post(chat))
|
||||
.route("/respond", post(respond::respond))
|
||||
.route("/usage", get(usage))
|
||||
.route("/sessions", get(sessions))
|
||||
.route("/context", get(truth::context))
|
||||
.route("/mode", post(mode::route))
|
||||
.route("/mode/list", get(mode::list))
|
||||
.route("/mode/execute", post(mode::execute))
|
||||
.route("/validate", post(validate::validate))
|
||||
.route("/iterate", post(iterate::iterate))
|
||||
.route("/health", get(health))
|
||||
.with_state(state)
|
||||
}
|
||||
|
||||
// -- Shared types (OpenAI-compatible) --
|
||||
|
||||
/// OpenAI-compatible message. `content` accepts either a plain string or
|
||||
/// an array of content parts (the modern multimodal shape:
|
||||
/// `[{type:"text", text:"..."}, {type:"image_url", ...}]`). We store as
|
||||
/// `serde_json::Value` to preserve client shape on forward; downstream
|
||||
/// providers can take it verbatim. `Message::text()` flattens for
|
||||
/// places that need a plain string (Ollama prompt assembly, char
|
||||
/// counts, the assistant's own response synthesis).
|
||||
#[derive(Serialize, Deserialize, Clone, Debug)]
|
||||
pub struct Message {
|
||||
pub role: String,
|
||||
pub content: serde_json::Value,
|
||||
pub content: String,
|
||||
}
|
||||
|
||||
impl Message {
|
||||
/// Construct a plain text message — the common shape for callers
|
||||
/// that don't need multimodal content. Wraps the body in
|
||||
/// `serde_json::Value::String` so downstream serializers see the
|
||||
/// canonical OpenAI shape.
|
||||
pub fn new_text(role: impl Into<String>, body: impl Into<String>) -> Self {
|
||||
Self {
|
||||
role: role.into(),
|
||||
content: serde_json::Value::String(body.into()),
|
||||
}
|
||||
}
|
||||
/// Flatten content to a plain string. Strings pass through; content-
|
||||
/// part arrays concatenate the `text` fields with newlines and skip
|
||||
/// non-text parts (images etc.) — Phase 38/39 callers are text-only,
|
||||
/// real multimodal forwarding is queued.
|
||||
pub fn text(&self) -> String {
|
||||
match &self.content {
|
||||
serde_json::Value::String(s) => s.clone(),
|
||||
serde_json::Value::Array(parts) => {
|
||||
let mut out = String::new();
|
||||
for p in parts {
|
||||
if let Some(t) = p.get("text").and_then(|v| v.as_str()) {
|
||||
if !out.is_empty() { out.push('\n'); }
|
||||
out.push_str(t);
|
||||
}
|
||||
}
|
||||
out
|
||||
}
|
||||
other => other.to_string(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Debug, Clone)]
|
||||
#[derive(Deserialize, Debug)]
|
||||
pub struct ChatRequest {
|
||||
pub model: String,
|
||||
pub messages: Vec<Message>,
|
||||
@ -236,140 +120,8 @@ pub struct UsageBlock {
|
||||
|
||||
// -- Handlers --
|
||||
|
||||
/// Phase 39: resolve (provider, effective_model) from a ChatRequest.
|
||||
///
|
||||
/// Explicit `req.provider` wins. If absent, infer from a model-name
|
||||
/// prefix: "openrouter/..." → openrouter (strip prefix), "cloud/..." →
|
||||
/// ollama_cloud (strip prefix). Bare names default to "ollama".
|
||||
///
|
||||
/// The stripped model is what the upstream adapter expects:
|
||||
/// OpenRouter's API wants "openai/gpt-4o-mini", not
|
||||
/// "openrouter/openai/gpt-4o-mini".
|
||||
fn resolve_provider(req: &ChatRequest) -> (String, String) {
|
||||
if let Some(p) = req.provider.as_deref() {
|
||||
return (p.to_ascii_lowercase(), req.model.clone());
|
||||
}
|
||||
if let Some(rest) = req.model.strip_prefix("openrouter/") {
|
||||
return ("openrouter".to_string(), rest.to_string());
|
||||
}
|
||||
if let Some(rest) = req.model.strip_prefix("cloud/") {
|
||||
return ("ollama_cloud".to_string(), rest.to_string());
|
||||
}
|
||||
if let Some(rest) = req.model.strip_prefix("gemini/") {
|
||||
return ("gemini".to_string(), rest.to_string());
|
||||
}
|
||||
if let Some(rest) = req.model.strip_prefix("claude/") {
|
||||
return ("claude".to_string(), rest.to_string());
|
||||
}
|
||||
if let Some(rest) = req.model.strip_prefix("kimi/") {
|
||||
return ("kimi".to_string(), rest.to_string());
|
||||
}
|
||||
if let Some(rest) = req.model.strip_prefix("opencode/") {
|
||||
return ("opencode".to_string(), rest.to_string());
|
||||
}
|
||||
// Bare `vendor/model` shape (e.g. `x-ai/grok-4.1-fast`,
|
||||
// `moonshotai/kimi-k2`, `openai/gpt-oss-120b:free`) → OpenRouter.
|
||||
// This makes the gateway a drop-in OpenAI-compatible middleware:
|
||||
// clients using the official `openai` SDK only set OPENAI_BASE_URL
|
||||
// + a model name and get correct upstream routing without needing
|
||||
// our custom `provider` field. Ollama models in J's stack use
|
||||
// `model:tag` form with NO slash (`qwen3.5:latest`, `kimi-k2:1t`),
|
||||
// so a slash here unambiguously means "namespaced provider/model".
|
||||
if req.model.contains('/') {
|
||||
return ("openrouter".to_string(), req.model.clone());
|
||||
}
|
||||
// Vendor-bare model names (no slash, no colon) — `gpt-4o-mini`,
|
||||
// `claude-3-5-sonnet-20241022`, etc. Tools like pi-ai validate
|
||||
// models against an OpenAI-style catalog (no namespace prefix),
|
||||
// so they send the bare name. Map to OpenRouter's namespaced form
|
||||
// by inferring the vendor from the leading token. Falls through to
|
||||
// ollama if no pattern matches — preserves existing behavior.
|
||||
if !req.model.contains(':') && !req.model.contains('/') {
|
||||
let m = req.model.as_str();
|
||||
if m.starts_with("gpt-") || m.starts_with("o1-") || m.starts_with("o3-") || m.starts_with("o4-") || m == "o1" || m == "o3" || m == "o4-mini" {
|
||||
return ("openrouter".to_string(), format!("openai/{}", m));
|
||||
}
|
||||
if m.starts_with("claude-") {
|
||||
return ("openrouter".to_string(), format!("anthropic/{}", m));
|
||||
}
|
||||
if m.starts_with("grok-") {
|
||||
return ("openrouter".to_string(), format!("x-ai/{}", m));
|
||||
}
|
||||
}
|
||||
("ollama".to_string(), req.model.clone())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod resolve_provider_tests {
|
||||
use super::*;
|
||||
|
||||
fn mk_req(provider: Option<&str>, model: &str) -> ChatRequest {
|
||||
ChatRequest {
|
||||
model: model.to_string(),
|
||||
messages: vec![],
|
||||
temperature: None,
|
||||
max_tokens: None,
|
||||
stream: None,
|
||||
think: None,
|
||||
provider: provider.map(|s| s.to_string()),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn explicit_provider_wins() {
|
||||
let r = mk_req(Some("openrouter"), "qwen3.5:latest");
|
||||
assert_eq!(resolve_provider(&r), ("openrouter".into(), "qwen3.5:latest".into()));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn bare_model_defaults_to_ollama() {
|
||||
let r = mk_req(None, "qwen3.5:latest");
|
||||
assert_eq!(resolve_provider(&r), ("ollama".into(), "qwen3.5:latest".into()));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn openrouter_prefix_infers_and_strips() {
|
||||
let r = mk_req(None, "openrouter/openai/gpt-4o-mini");
|
||||
assert_eq!(resolve_provider(&r), ("openrouter".into(), "openai/gpt-4o-mini".into()));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn cloud_prefix_infers_and_strips() {
|
||||
let r = mk_req(None, "cloud/kimi-k2:1t");
|
||||
assert_eq!(resolve_provider(&r), ("ollama_cloud".into(), "kimi-k2:1t".into()));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn explicit_provider_preserves_full_model_even_with_prefix() {
|
||||
// If caller provides both provider and a model with a prefix,
|
||||
// trust them — don't strip. The adapter will get the full model
|
||||
// string as-is.
|
||||
let r = mk_req(Some("openrouter"), "openrouter/openai/gpt-4o-mini");
|
||||
assert_eq!(resolve_provider(&r), ("openrouter".into(), "openrouter/openai/gpt-4o-mini".into()));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn gemini_prefix_infers_and_strips() {
|
||||
let r = mk_req(None, "gemini/gemini-2.0-flash");
|
||||
assert_eq!(resolve_provider(&r), ("gemini".into(), "gemini-2.0-flash".into()));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn claude_prefix_infers_and_strips() {
|
||||
let r = mk_req(None, "claude/claude-3-5-sonnet-latest");
|
||||
assert_eq!(resolve_provider(&r), ("claude".into(), "claude-3-5-sonnet-latest".into()));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn kimi_prefix_infers_and_strips() {
|
||||
let r = mk_req(None, "kimi/kimi-for-coding");
|
||||
assert_eq!(resolve_provider(&r), ("kimi".into(), "kimi-for-coding".into()));
|
||||
}
|
||||
}
|
||||
|
||||
async fn chat(
|
||||
State(state): State<V1State>,
|
||||
headers: axum::http::HeaderMap,
|
||||
Json(req): Json<ChatRequest>,
|
||||
) -> Result<Json<ChatResponse>, (StatusCode, String)> {
|
||||
if req.messages.is_empty() {
|
||||
@ -379,111 +131,27 @@ async fn chat(
|
||||
tracing::warn!("/v1/chat: stream=true requested but Phase 38 returns non-streaming");
|
||||
}
|
||||
|
||||
// Provider resolution: explicit `req.provider` wins; otherwise
|
||||
// infer from a model-name prefix. Phase 39 PRD gate example:
|
||||
// `model: "openrouter/openai/gpt-4o-mini"` → provider "openrouter",
|
||||
// adapter gets the stripped "openai/gpt-4o-mini".
|
||||
let (provider, effective_model) = resolve_provider(&req);
|
||||
let provider = req.provider.as_deref().unwrap_or("ollama").to_ascii_lowercase();
|
||||
let start_time = chrono::Utc::now();
|
||||
let start_instant = std::time::Instant::now();
|
||||
|
||||
// If we stripped a prefix, clone req with the effective model so
|
||||
// the adapter sees what the upstream provider expects (OpenRouter
|
||||
// wants "openai/gpt-4o-mini", not "openrouter/openai/gpt-4o-mini").
|
||||
let req_for_adapter: std::borrow::Cow<'_, ChatRequest> =
|
||||
if effective_model == req.model {
|
||||
std::borrow::Cow::Borrowed(&req)
|
||||
} else {
|
||||
let mut cloned = req.clone();
|
||||
cloned.model = effective_model.clone();
|
||||
std::borrow::Cow::Owned(cloned)
|
||||
};
|
||||
|
||||
let (resp, used_provider) = match provider.as_str() {
|
||||
"ollama" | "local" | "" => {
|
||||
let r = ollama::chat(&state.ai_client, &*req_for_adapter)
|
||||
.await
|
||||
.map_err(|e| (StatusCode::BAD_GATEWAY, format!("ollama local: {e}")))?;
|
||||
(r, "ollama".to_string())
|
||||
}
|
||||
let resp = match provider.as_str() {
|
||||
"ollama" | "local" | "" => ollama::chat(&state.ai_client, &req)
|
||||
.await
|
||||
.map_err(|e| (StatusCode::BAD_GATEWAY, format!("ollama local: {e}")))?,
|
||||
"ollama_cloud" | "cloud" => {
|
||||
let key = state.ollama_cloud_key.as_deref().ok_or((
|
||||
StatusCode::SERVICE_UNAVAILABLE,
|
||||
"OLLAMA_CLOUD_KEY not configured".to_string(),
|
||||
"OLLAMA_CLOUD_KEY not configured — set env or populate /root/llm_team_config.json".to_string(),
|
||||
))?;
|
||||
let r = ollama_cloud::chat(key, &*req_for_adapter)
|
||||
ollama_cloud::chat(key, &req)
|
||||
.await
|
||||
.map_err(|e| (StatusCode::BAD_GATEWAY, format!("ollama cloud: {e}")))?;
|
||||
(r, "ollama_cloud".to_string())
|
||||
}
|
||||
"openrouter" | "openrouter_free" => {
|
||||
// Free-tier rescue rung. Added 2026-04-24 after iter 5
|
||||
// repeated Ollama Cloud 502s on kimi-k2:1t — OpenRouter
|
||||
// gives a different provider backbone as fallback.
|
||||
let key = state.openrouter_key.as_deref().ok_or((
|
||||
StatusCode::SERVICE_UNAVAILABLE,
|
||||
"OPENROUTER_API_KEY not configured".to_string(),
|
||||
))?;
|
||||
let r = openrouter::chat(key, &*req_for_adapter)
|
||||
.await
|
||||
.map_err(|e| (StatusCode::BAD_GATEWAY, format!("openrouter: {e}")))?;
|
||||
(r, "openrouter".to_string())
|
||||
}
|
||||
"gemini" => {
|
||||
// Phase 40 provider adapter. Google Generative Language
|
||||
// API via query-string key auth (not bearer).
|
||||
let key = state.gemini_key.as_deref().ok_or((
|
||||
StatusCode::SERVICE_UNAVAILABLE,
|
||||
"GEMINI_API_KEY not configured".to_string(),
|
||||
))?;
|
||||
let r = gemini::chat(key, &*req_for_adapter)
|
||||
.await
|
||||
.map_err(|e| (StatusCode::BAD_GATEWAY, format!("gemini: {e}")))?;
|
||||
(r, "gemini".to_string())
|
||||
}
|
||||
"claude" | "anthropic" => {
|
||||
// Phase 40 provider adapter. Anthropic Messages API via
|
||||
// x-api-key header + anthropic-version:2023-06-01.
|
||||
let key = state.claude_key.as_deref().ok_or((
|
||||
StatusCode::SERVICE_UNAVAILABLE,
|
||||
"ANTHROPIC_API_KEY not configured".to_string(),
|
||||
))?;
|
||||
let r = claude::chat(key, &*req_for_adapter)
|
||||
.await
|
||||
.map_err(|e| (StatusCode::BAD_GATEWAY, format!("claude: {e}")))?;
|
||||
(r, "claude".to_string())
|
||||
}
|
||||
"kimi" => {
|
||||
// Direct Kimi For Coding provider — bypasses Ollama Cloud's
|
||||
// upstream-broken kimi-k2:1t and OpenRouter's rate-limited
|
||||
// moonshotai/kimi-k2.6. Uses sk-kimi-* keys from the Kimi
|
||||
// membership console.
|
||||
let key = state.kimi_key.as_deref().ok_or((
|
||||
StatusCode::SERVICE_UNAVAILABLE,
|
||||
"KIMI_API_KEY not configured".to_string(),
|
||||
))?;
|
||||
let r = kimi::chat(key, &*req_for_adapter)
|
||||
.await
|
||||
.map_err(|e| (StatusCode::BAD_GATEWAY, format!("kimi: {e}")))?;
|
||||
(r, "kimi".to_string())
|
||||
}
|
||||
"opencode" => {
|
||||
// OpenCode GO multi-vendor gateway — Claude Opus 4.7,
|
||||
// GPT-5.5-pro, Gemini 3.1-pro, Kimi K2.6, DeepSeek, GLM,
|
||||
// Qwen, free-tier. OpenAI-compat at opencode.ai/zen/go/v1.
|
||||
let key = state.opencode_key.as_deref().ok_or((
|
||||
StatusCode::SERVICE_UNAVAILABLE,
|
||||
"OPENCODE_API_KEY not configured".to_string(),
|
||||
))?;
|
||||
let r = opencode::chat(key, &*req_for_adapter)
|
||||
.await
|
||||
.map_err(|e| (StatusCode::BAD_GATEWAY, format!("opencode: {e}")))?;
|
||||
(r, "opencode".to_string())
|
||||
.map_err(|e| (StatusCode::BAD_GATEWAY, format!("ollama cloud: {e}")))?
|
||||
}
|
||||
other => {
|
||||
return Err((
|
||||
StatusCode::BAD_REQUEST,
|
||||
format!("unknown provider '{other}' — supported: ollama, ollama_cloud, openrouter, gemini, claude, kimi, opencode"),
|
||||
format!("unknown provider '{other}' — supported: ollama, ollama_cloud"),
|
||||
));
|
||||
}
|
||||
};
|
||||
@ -497,21 +165,10 @@ async fn chat(
|
||||
// untouched.
|
||||
if let Some(lf) = &state.langfuse {
|
||||
let output = resp.choices.first()
|
||||
.map(|c| c.message.text())
|
||||
.map(|c| c.message.content.clone())
|
||||
.unwrap_or_default();
|
||||
// Cross-runtime trace linkage. When a caller (validatord on
|
||||
// Go side, /v1/iterate on Rust side) forwards a parent trace
|
||||
// id via X-Lakehouse-Trace-Id, attach this generation to that
|
||||
// trace so the iterate session and its inner chat hops show
|
||||
// up as ONE trace tree in Langfuse. Header name matches the
|
||||
// Go-side `shared.TraceIDHeader` constant byte-for-byte.
|
||||
let parent_trace_id = headers
|
||||
.get(crate::v1::iterate::TRACE_ID_HEADER)
|
||||
.and_then(|v| v.to_str().ok())
|
||||
.map(|s| s.to_string())
|
||||
.filter(|s| !s.is_empty());
|
||||
lf.emit_chat(langfuse_trace::ChatTrace {
|
||||
provider: used_provider.clone(),
|
||||
provider: provider.clone(),
|
||||
model: resp.model.clone(),
|
||||
input: req.messages.clone(),
|
||||
output,
|
||||
@ -523,63 +180,15 @@ async fn chat(
|
||||
start_time: start_time.to_rfc3339(),
|
||||
end_time: end_time.to_rfc3339(),
|
||||
latency_ms,
|
||||
parent_trace_id,
|
||||
});
|
||||
}
|
||||
|
||||
// Phase 40 part 2 — fire-and-forget /event to observer at :3800.
|
||||
// Same ring-buffer that scrum + scenario events land in, so any
|
||||
// tool-routed-through-our-gateway (Pi, Archon, openai SDK clients)
|
||||
// shows up alongside scrum_master events for KB consolidation +
|
||||
// pathway-memory + bug-fingerprint compounding. Best-effort:
|
||||
// observer being down doesn't block the chat response.
|
||||
{
|
||||
let provider = used_provider.clone();
|
||||
let model = resp.model.clone();
|
||||
let prompt_tokens = resp.usage.prompt_tokens;
|
||||
let completion_tokens = resp.usage.completion_tokens;
|
||||
let success = true;
|
||||
tokio::spawn(async move {
|
||||
let body = serde_json::json!({
|
||||
"endpoint": "/v1/chat",
|
||||
"source": "v1.chat",
|
||||
"event_kind": "chat_completion",
|
||||
"input_summary": format!(
|
||||
"{} {} prompt={}t",
|
||||
provider, model, prompt_tokens
|
||||
),
|
||||
"output_summary": format!(
|
||||
"completion={}t {}ms",
|
||||
completion_tokens, latency_ms
|
||||
),
|
||||
"success": success,
|
||||
"duration_ms": latency_ms,
|
||||
});
|
||||
let client = reqwest::Client::builder()
|
||||
.timeout(std::time::Duration::from_secs(2))
|
||||
.build()
|
||||
.unwrap_or_else(|_| reqwest::Client::new());
|
||||
let _ = client
|
||||
.post("http://localhost:3800/event")
|
||||
.json(&body)
|
||||
.send()
|
||||
.await;
|
||||
});
|
||||
}
|
||||
|
||||
// Phase 40: per-provider usage tracking
|
||||
{
|
||||
let mut u = state.usage.write().await;
|
||||
u.requests += 1;
|
||||
u.prompt_tokens += resp.usage.prompt_tokens as u64;
|
||||
u.completion_tokens += resp.usage.completion_tokens as u64;
|
||||
u.total_tokens += resp.usage.total_tokens as u64;
|
||||
|
||||
let provider_usage = u.by_provider.entry(used_provider).or_default();
|
||||
provider_usage.requests += 1;
|
||||
provider_usage.prompt_tokens += resp.usage.prompt_tokens as u64;
|
||||
provider_usage.completion_tokens += resp.usage.completion_tokens as u64;
|
||||
provider_usage.total_tokens += resp.usage.total_tokens as u64;
|
||||
}
|
||||
|
||||
Ok(Json(resp))
|
||||
@ -590,43 +199,6 @@ async fn usage(State(state): State<V1State>) -> impl IntoResponse {
|
||||
Json(snapshot)
|
||||
}
|
||||
|
||||
/// Production operational health endpoint.
|
||||
///
|
||||
/// `/v1/health` reports per-subsystem status as a JSON object so an
|
||||
/// operator (or the lakehouse-auditor service, or a load balancer)
|
||||
/// can verify the gateway is fully booted, has its provider keys
|
||||
/// loaded, the worker roster is hot, and Langfuse is reachable.
|
||||
/// Returns 200 always — fields are observed-state, not pass/fail
|
||||
/// gates. A monitoring tool should evaluate the booleans + counts
|
||||
/// against its own thresholds.
|
||||
async fn health(State(state): State<V1State>) -> impl IntoResponse {
|
||||
// Honest worker count via WorkerLookup::len. Production switchover
|
||||
// verification: after swapping workers_500k.parquet → real Chicago
|
||||
// data and restarting, this number should match the row count of
|
||||
// the new file. 0 means the file was missing / unreadable / had a
|
||||
// schema mismatch and the gateway booted with the empty fallback.
|
||||
let workers_count = state.validate_workers.len();
|
||||
let providers_configured = serde_json::json!({
|
||||
"ollama_cloud": state.ollama_cloud_key.is_some(),
|
||||
"openrouter": state.openrouter_key.is_some(),
|
||||
"kimi": state.kimi_key.is_some(),
|
||||
"opencode": state.opencode_key.is_some(),
|
||||
"gemini": state.gemini_key.is_some(),
|
||||
"claude": state.claude_key.is_some(),
|
||||
});
|
||||
let langfuse_configured = state.langfuse.is_some();
|
||||
let usage_snapshot = state.usage.read().await.clone();
|
||||
Json(serde_json::json!({
|
||||
"status": "ok",
|
||||
"workers_count": workers_count,
|
||||
"workers_loaded": workers_count > 0,
|
||||
"providers_configured": providers_configured,
|
||||
"langfuse_configured": langfuse_configured,
|
||||
"usage_total_requests": usage_snapshot.requests,
|
||||
"usage_by_provider": usage_snapshot.by_provider.keys().collect::<Vec<_>>(),
|
||||
}))
|
||||
}
|
||||
|
||||
// Phase 38 is stateless — no session persistence yet. Return an empty
|
||||
// list in OpenAI-ish shape so clients that probe this endpoint don't
|
||||
// 404. Real session state lands in Phase 41 with the profile-system
|
||||
@ -658,7 +230,7 @@ mod tests {
|
||||
assert_eq!(r.model, "qwen3.5:latest");
|
||||
assert_eq!(r.messages.len(), 2);
|
||||
assert_eq!(r.messages[0].role, "system");
|
||||
assert_eq!(r.messages[1].text(), "Hi");
|
||||
assert_eq!(r.messages[1].content, "Hi");
|
||||
assert_eq!(r.temperature, Some(0.2));
|
||||
assert_eq!(r.max_tokens, Some(100));
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -60,7 +60,10 @@ pub async fn chat(client: &AiClient, req: &ChatRequest) -> Result<ChatResponse,
|
||||
model: resp.model,
|
||||
choices: vec![Choice {
|
||||
index: 0,
|
||||
message: Message::new_text("assistant", resp.text),
|
||||
message: Message {
|
||||
role: "assistant".into(),
|
||||
content: resp.text,
|
||||
},
|
||||
finish_reason: "stop".into(),
|
||||
}],
|
||||
usage: UsageBlock {
|
||||
@ -86,14 +89,13 @@ fn flatten_messages(messages: &[Message]) -> (String, String) {
|
||||
let mut system = String::new();
|
||||
let mut prompt = String::new();
|
||||
for m in messages {
|
||||
let body = m.text();
|
||||
if m.role == "system" {
|
||||
if !system.is_empty() { system.push('\n'); }
|
||||
system.push_str(&body);
|
||||
system.push_str(&m.content);
|
||||
} else {
|
||||
prompt.push_str(&m.role);
|
||||
prompt.push_str(": ");
|
||||
prompt.push_str(&body);
|
||||
prompt.push_str(&m.content);
|
||||
prompt.push_str("\n\n");
|
||||
}
|
||||
}
|
||||
@ -102,7 +104,7 @@ fn flatten_messages(messages: &[Message]) -> (String, String) {
|
||||
}
|
||||
|
||||
fn estimate_prompt_tokens(messages: &[Message]) -> u32 {
|
||||
let chars: usize = messages.iter().map(|m| m.text().chars().count()).sum();
|
||||
let chars: usize = messages.iter().map(|m| m.content.chars().count()).sum();
|
||||
((chars + 3) / 4) as u32
|
||||
}
|
||||
|
||||
|
||||
@ -88,7 +88,7 @@ pub async fn chat(
|
||||
let text = parsed.response.unwrap_or_default();
|
||||
|
||||
let prompt_tokens = parsed.prompt_eval_count.unwrap_or_else(|| {
|
||||
let chars: usize = req.messages.iter().map(|m| m.text().chars().count()).sum();
|
||||
let chars: usize = req.messages.iter().map(|m| m.content.chars().count()).sum();
|
||||
((chars + 3) / 4) as u32
|
||||
});
|
||||
let completion_tokens = parsed.eval_count.unwrap_or_else(|| {
|
||||
@ -112,7 +112,7 @@ pub async fn chat(
|
||||
model: parsed.model.unwrap_or_else(|| req.model.clone()),
|
||||
choices: vec![Choice {
|
||||
index: 0,
|
||||
message: Message::new_text("assistant", text),
|
||||
message: Message { role: "assistant".into(), content: text },
|
||||
finish_reason: "stop".into(),
|
||||
}],
|
||||
usage: UsageBlock {
|
||||
|
||||
@ -1,228 +0,0 @@
|
||||
//! OpenCode GO adapter — multi-vendor curated gateway via opencode.ai/zen/go.
|
||||
//!
|
||||
//! One sk-* key reaches Claude Opus 4.7, GPT-5.5-pro, Gemini 3.1-pro,
|
||||
//! Kimi K2.6, DeepSeek, GLM, Qwen, plus 4 free-tier models.
|
||||
//! OpenAI-compatible Chat Completions; auth via Bearer.
|
||||
//!
|
||||
//! Why a separate adapter (vs reusing openrouter.rs):
|
||||
//! - Different account, different key, different base_url
|
||||
//! - No HTTP-Referer / X-Title headers (those are OpenRouter-specific)
|
||||
//! - Future-proof for any opencode-only request shaping
|
||||
//!
|
||||
//! Key sourcing priority:
|
||||
//! 1. Env var `OPENCODE_API_KEY` (loaded from /etc/lakehouse/opencode.env
|
||||
//! via systemd EnvironmentFile=)
|
||||
//! 2. /etc/lakehouse/opencode.env directly (rescue path if env missing)
|
||||
//!
|
||||
//! Resolved once at gateway startup, stored on `V1State.opencode_key`.
|
||||
//! Model-prefix routing: "opencode/<model>" auto-routes here, prefix
|
||||
//! stripped before upstream call.
|
||||
|
||||
use std::time::Duration;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use super::{ChatRequest, ChatResponse, Choice, Message, UsageBlock};
|
||||
|
||||
// /zen/v1 is the unified OpenCode endpoint that covers BOTH the
|
||||
// Zen pay-per-token tier (Claude/GPT/Gemini frontier) AND the Go
|
||||
// subscription tier (Kimi/GLM/DeepSeek/Qwen/Minimax/mimo). When the
|
||||
// caller has both, opencode bills per-model: Zen models charge Zen
|
||||
// balance, Go models charge against the Go subscription cap.
|
||||
//
|
||||
// /zen/go/v1 exists as a Go-only sub-path (rejects Zen models with
|
||||
// "Model not supported"); we use the unified /zen/v1 since the same
|
||||
// key works for both with correct billing routing upstream.
|
||||
const OPENCODE_BASE_URL: &str = "https://opencode.ai/zen/v1";
|
||||
// 600s default — opencode upstream models include reasoning-heavy
|
||||
// variants (Claude Opus, Kimi K2.6, GLM-5.1) that legitimately take
|
||||
// 3-5 min on big audit prompts. Override via OPENCODE_TIMEOUT_SECS.
|
||||
const OPENCODE_TIMEOUT_SECS_DEFAULT: u64 = 600;
|
||||
|
||||
fn opencode_timeout_secs() -> u64 {
|
||||
std::env::var("OPENCODE_TIMEOUT_SECS")
|
||||
.ok()
|
||||
.and_then(|s| s.trim().parse::<u64>().ok())
|
||||
.filter(|&n| n > 0)
|
||||
.unwrap_or(OPENCODE_TIMEOUT_SECS_DEFAULT)
|
||||
}
|
||||
|
||||
/// Resolve the OpenCode API key, first from the process environment,
/// then from the systemd EnvironmentFile on disk.
///
/// Priority: `OPENCODE_API_KEY` env var (non-empty after trim) wins;
/// otherwise `/etc/lakehouse/opencode.env` is scanned line-by-line for
/// `OPENCODE_API_KEY=…` (quotes stripped). Returns `None` when neither
/// source yields a non-empty value.
pub fn resolve_opencode_key() -> Option<String> {
    // Source 1: environment (normally populated via EnvironmentFile=).
    let from_env = std::env::var("OPENCODE_API_KEY")
        .ok()
        .map(|k| k.trim().to_string())
        .filter(|k| !k.is_empty());
    if from_env.is_some() {
        return from_env;
    }

    // Source 2: rescue path — parse the env file directly in case the
    // variable never made it into our environment.
    std::fs::read_to_string("/etc/lakehouse/opencode.env")
        .ok()
        .and_then(|raw| {
            raw.lines().find_map(|line| {
                let rest = line.strip_prefix("OPENCODE_API_KEY=")?;
                let k = rest.trim().trim_matches('"').trim_matches('\'');
                (!k.is_empty()).then(|| k.to_string())
            })
        })
}
|
||||
|
||||
/// One non-streaming chat completion against OpenCode's unified
/// `/zen/v1/chat/completions` endpoint.
///
/// Strips the `opencode/` routing prefix, shapes the request per
/// upstream quirks (temperature dropped for reasoning models,
/// `max_tokens: Some(0)` rejected), and converts the reply into our
/// OpenAI-compatible `ChatResponse`. Errors are returned as strings
/// with a distinct prefix per failure stage (unreachable / non-2xx /
/// bad JSON / empty choices) so callers can surface them verbatim.
pub async fn chat(
    key: &str,
    req: &ChatRequest,
) -> Result<ChatResponse, String> {
    // Strip the "opencode/" namespace prefix so the upstream sees the
    // bare model id (e.g. "claude-opus-4-7", "kimi-k2.6").
    let model = req.model.strip_prefix("opencode/").unwrap_or(&req.model).to_string();

    // Anthropic models on opencode reject `temperature` with a 400
    // "temperature is deprecated for this model" error. Strip the
    // field for claude-* and the new gpt-5.x reasoning lineages
    // (Anthropic/OpenAI's reasoning models all moved away from temp).
    // Other models keep the caller's value or default to 0.3.
    let drop_temp = model.starts_with("claude-")
        || model.starts_with("gpt-5")
        || model.starts_with("o1")
        || model.starts_with("o3")
        || model.starts_with("o4");
    let body = OCChatBody {
        model: model.clone(),
        // Content passed through verbatim (serde_json::Value) so
        // multimodal content-part arrays survive untouched.
        messages: req.messages.iter().map(|m| OCMessage {
            role: m.role.clone(),
            content: m.content.clone(),
        }).collect(),
        // filter(|&n| n > 0) catches Some(0) — same trap that bit the
        // Kimi adapter when callers passed empty-env-parsed-to-0.
        max_tokens: req.max_tokens.filter(|&n| n > 0).unwrap_or(800),
        temperature: if drop_temp { None } else { Some(req.temperature.unwrap_or(0.3)) },
        stream: false,
    };

    // Fresh client per call; timeout is env-tunable because reasoning
    // models can legitimately run for minutes (see const above).
    let client = reqwest::Client::builder()
        .timeout(Duration::from_secs(opencode_timeout_secs()))
        .build()
        .map_err(|e| format!("build client: {e}"))?;

    let t0 = std::time::Instant::now();
    let resp = client
        .post(format!("{}/chat/completions", OPENCODE_BASE_URL))
        .bearer_auth(key)
        .json(&body)
        .send()
        .await
        .map_err(|e| format!("opencode.ai unreachable: {e}"))?;

    let status = resp.status();
    if !status.is_success() {
        // Include the upstream error body — opencode's 4xx payloads
        // carry the actionable message (model not supported, etc.).
        let body = resp.text().await.unwrap_or_else(|_| "?".into());
        return Err(format!("opencode.ai {}: {}", status, body));
    }

    let parsed: OCChatResponse = resp.json().await
        .map_err(|e| format!("invalid opencode response: {e}"))?;

    let latency_ms = t0.elapsed().as_millis();
    let choice = parsed.choices.into_iter().next()
        .ok_or_else(|| "opencode returned no choices".to_string())?;
    let text = choice.message.content;

    // Usage may be absent upstream — fall back to the chars/4 token
    // estimate used elsewhere in this crate.
    let prompt_tokens = parsed.usage.as_ref().map(|u| u.prompt_tokens).unwrap_or_else(|| {
        let chars: usize = req.messages.iter().map(|m| m.text().chars().count()).sum();
        ((chars + 3) / 4) as u32
    });
    let completion_tokens = parsed.usage.as_ref().map(|u| u.completion_tokens).unwrap_or_else(|| {
        ((text.chars().count() + 3) / 4) as u32
    });

    tracing::info!(
        target: "v1.chat",
        provider = "opencode",
        model = %model,
        prompt_tokens,
        completion_tokens,
        latency_ms = latency_ms as u64,
        "opencode chat completed",
    );

    Ok(ChatResponse {
        id: format!("chatcmpl-{}", chrono::Utc::now().timestamp_nanos_opt().unwrap_or(0)),
        object: "chat.completion",
        created: chrono::Utc::now().timestamp(),
        model,
        choices: vec![Choice {
            index: 0,
            message: Message { role: "assistant".into(), content: serde_json::Value::String(text) },
            finish_reason: choice.finish_reason.unwrap_or_else(|| "stop".into()),
        }],
        usage: UsageBlock {
            prompt_tokens,
            completion_tokens,
            total_tokens: prompt_tokens + completion_tokens,
        },
    })
}
|
||||
|
||||
// -- OpenCode wire shapes (OpenAI-compatible) --
|
||||
|
||||
// Request body for POST /chat/completions. Field names are the
// OpenAI wire names — do not rename without a #[serde(rename)].
#[derive(Serialize)]
struct OCChatBody {
    model: String,
    messages: Vec<OCMessage>,
    max_tokens: u32,
    // Omitted entirely (not null) when None — required because some
    // upstream models 400 on the mere presence of the field.
    #[serde(skip_serializing_if = "Option::is_none")]
    temperature: Option<f64>,
    stream: bool,
}

// Content is a raw serde_json::Value so multimodal content-part
// arrays pass through unmodified.
#[derive(Serialize)]
struct OCMessage { role: String, content: serde_json::Value }

// Minimal response projection — only the fields this adapter reads.
#[derive(Deserialize)]
struct OCChatResponse {
    choices: Vec<OCChoice>,
    // `default` tolerates providers that omit usage accounting.
    #[serde(default)]
    usage: Option<OCUsage>,
}

#[derive(Deserialize)]
struct OCChoice {
    message: OCMessageResp,
    #[serde(default)]
    finish_reason: Option<String>,
}

#[derive(Deserialize)]
struct OCMessageResp { content: String }

#[derive(Deserialize)]
struct OCUsage { prompt_tokens: u32, completion_tokens: u32 }
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // Smoke test: key resolution touches env + filesystem; just
    // confirm it returns cleanly in any environment.
    #[test]
    fn resolve_opencode_key_does_not_panic() {
        let _ = resolve_opencode_key();
    }

    // Prefix routing: "opencode/<model>" is stripped, bare ids pass
    // through unchanged.
    #[test]
    fn model_prefix_strip() {
        let cases = [
            ("opencode/claude-opus-4-7", "claude-opus-4-7"),
            ("opencode/kimi-k2.6", "kimi-k2.6"),
            ("claude-opus-4-7", "claude-opus-4-7"),
        ];
        for (input, expected) in cases {
            let out = input.strip_prefix("opencode/").unwrap_or(input);
            assert_eq!(out, expected);
        }
    }

    #[test]
    fn max_tokens_filters_zero() {
        // The trap: empty env -> Number("") -> 0 -> Some(0). Adapter
        // must not pass 0 upstream; should fall to 800.
        let some_zero: Option<u32> = Some(0);
        let result = some_zero.filter(|&n| n > 0).unwrap_or(800);
        assert_eq!(result, 800);
        // Real caller-supplied values survive the filter.
        let some_real: Option<u32> = Some(4096);
        assert_eq!(some_real.filter(|&n| n > 0).unwrap_or(800), 4096);
        // Absent value falls back to the default.
        let none_val: Option<u32> = None;
        assert_eq!(none_val.filter(|&n| n > 0).unwrap_or(800), 800);
    }
}
|
||||
@ -1,220 +0,0 @@
|
||||
//! OpenRouter adapter — free-tier rescue rung for /v1/chat.
|
||||
//!
|
||||
//! Direct HTTPS call to `https://openrouter.ai/api/v1/chat/completions`
|
||||
//! with Bearer auth. Mirrors the OpenAI-compatible shape so the model
|
||||
//! list can be expanded without code changes. Added 2026-04-24 after
|
||||
//! iter 5 hit repeated Ollama Cloud 502s on kimi-k2:1t — OpenRouter
|
||||
//! free-tier models give us a different provider backbone as fallback.
|
||||
//!
|
||||
//! Key sourcing priority:
|
||||
//! 1. Env var `OPENROUTER_API_KEY`
|
||||
//! 2. `/home/profit/.env` (LLM Team convention)
|
||||
//! 3. `/root/llm_team_config.json` → providers.openrouter.api_key
|
||||
//!
|
||||
//! First hit wins. Key is resolved once at gateway startup and stored
|
||||
//! on `V1State.openrouter_key`.
|
||||
|
||||
use std::time::Duration;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use super::{ChatRequest, ChatResponse, Choice, Message, UsageBlock};
|
||||
|
||||
const OR_BASE_URL: &str = "https://openrouter.ai/api/v1";
|
||||
const OR_TIMEOUT_SECS: u64 = 180;
|
||||
|
||||
pub fn resolve_openrouter_key() -> Option<String> {
|
||||
if let Ok(k) = std::env::var("OPENROUTER_API_KEY") {
|
||||
if !k.trim().is_empty() { return Some(k.trim().to_string()); }
|
||||
}
|
||||
// LLM Team UI writes its key to ~/.env on the host user — pick it up
|
||||
// from the same source so the free-tier rescue path works without
|
||||
// an explicit systemd Environment= line.
|
||||
for path in ["/home/profit/.env", "/root/.env"] {
|
||||
if let Ok(raw) = std::fs::read_to_string(path) {
|
||||
for line in raw.lines() {
|
||||
if let Some(rest) = line.strip_prefix("OPENROUTER_API_KEY=") {
|
||||
let k = rest.trim().trim_matches('"').trim_matches('\'');
|
||||
if !k.is_empty() { return Some(k.to_string()); }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if let Ok(raw) = std::fs::read_to_string("/root/llm_team_config.json") {
|
||||
if let Ok(v) = serde_json::from_str::<serde_json::Value>(&raw) {
|
||||
if let Some(k) = v.pointer("/providers/openrouter/api_key").and_then(|x| x.as_str()) {
|
||||
if !k.trim().is_empty() { return Some(k.trim().to_string()); }
|
||||
}
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
pub async fn chat(
|
||||
key: &str,
|
||||
req: &ChatRequest,
|
||||
) -> Result<ChatResponse, String> {
|
||||
// Strip the "openrouter/" prefix if the caller used the namespaced
|
||||
// form so OpenRouter sees the raw model id (e.g. "openai/gpt-oss-120b:free").
|
||||
let model = req.model.strip_prefix("openrouter/").unwrap_or(&req.model).to_string();
|
||||
|
||||
let body = ORChatBody {
|
||||
model: model.clone(),
|
||||
// Pass content through verbatim — preserves OpenAI's multimodal
|
||||
// content-parts shape (`[{type:"text",text:"..."}, ...]`) so the
|
||||
// upstream provider sees exactly what the client sent.
|
||||
messages: req.messages.iter().map(|m| ORMessage {
|
||||
role: m.role.clone(),
|
||||
content: m.content.clone(),
|
||||
}).collect(),
|
||||
max_tokens: req.max_tokens.unwrap_or(800),
|
||||
temperature: req.temperature.unwrap_or(0.3),
|
||||
stream: false,
|
||||
};
|
||||
|
||||
let client = reqwest::Client::builder()
|
||||
.timeout(Duration::from_secs(OR_TIMEOUT_SECS))
|
||||
.build()
|
||||
.map_err(|e| format!("build client: {e}"))?;
|
||||
|
||||
let t0 = std::time::Instant::now();
|
||||
let resp = client
|
||||
.post(format!("{}/chat/completions", OR_BASE_URL))
|
||||
.bearer_auth(key)
|
||||
// OpenRouter recommends Referer + Title for attribution; absent
|
||||
// headers do not fail the call but help us see our traffic in
|
||||
// their dashboard.
|
||||
.header("HTTP-Referer", "https://vcp.devop.live")
|
||||
.header("X-Title", "Lakehouse Scrum")
|
||||
.json(&body)
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| format!("openrouter.ai unreachable: {e}"))?;
|
||||
|
||||
let status = resp.status();
|
||||
if !status.is_success() {
|
||||
let body = resp.text().await.unwrap_or_else(|_| "?".into());
|
||||
return Err(format!("openrouter.ai {}: {}", status, body));
|
||||
}
|
||||
|
||||
let parsed: ORChatResponse = resp.json().await
|
||||
.map_err(|e| format!("invalid openrouter response: {e}"))?;
|
||||
|
||||
let latency_ms = t0.elapsed().as_millis();
|
||||
let choice = parsed.choices.into_iter().next()
|
||||
.ok_or_else(|| "openrouter returned no choices".to_string())?;
|
||||
let text = choice.message.content;
|
||||
|
||||
let prompt_tokens = parsed.usage.as_ref().map(|u| u.prompt_tokens).unwrap_or_else(|| {
|
||||
let chars: usize = req.messages.iter().map(|m| m.text().chars().count()).sum();
|
||||
((chars + 3) / 4) as u32
|
||||
});
|
||||
let completion_tokens = parsed.usage.as_ref().map(|u| u.completion_tokens).unwrap_or_else(|| {
|
||||
((text.chars().count() + 3) / 4) as u32
|
||||
});
|
||||
|
||||
tracing::info!(
|
||||
target: "v1.chat",
|
||||
provider = "openrouter",
|
||||
model = %model,
|
||||
prompt_tokens,
|
||||
completion_tokens,
|
||||
latency_ms = latency_ms as u64,
|
||||
"openrouter chat completed",
|
||||
);
|
||||
|
||||
Ok(ChatResponse {
|
||||
id: format!("chatcmpl-{}", chrono::Utc::now().timestamp_nanos_opt().unwrap_or(0)),
|
||||
object: "chat.completion",
|
||||
created: chrono::Utc::now().timestamp(),
|
||||
model,
|
||||
choices: vec![Choice {
|
||||
index: 0,
|
||||
message: Message { role: "assistant".into(), content: serde_json::Value::String(text) },
|
||||
finish_reason: choice.finish_reason.unwrap_or_else(|| "stop".into()),
|
||||
}],
|
||||
usage: UsageBlock {
|
||||
prompt_tokens,
|
||||
completion_tokens,
|
||||
total_tokens: prompt_tokens + completion_tokens,
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
// -- OpenRouter wire shapes (OpenAI-compatible) --
|
||||
|
||||
// Request body for POST /chat/completions. Field names are the
// OpenAI wire names — do not rename without a #[serde(rename)].
#[derive(Serialize)]
struct ORChatBody {
    model: String,
    messages: Vec<ORMessage>,
    max_tokens: u32,
    temperature: f64,
    stream: bool,
}

// Content is a raw serde_json::Value so multimodal content-part
// arrays pass through unmodified.
#[derive(Serialize)]
struct ORMessage { role: String, content: serde_json::Value }

// Minimal response projection — only the fields this adapter reads.
#[derive(Deserialize)]
struct ORChatResponse {
    choices: Vec<ORChoice>,
    // `default` tolerates providers that omit usage accounting.
    #[serde(default)]
    usage: Option<ORUsage>,
}

#[derive(Deserialize)]
struct ORChoice {
    message: ORMessageResp,
    #[serde(default)]
    finish_reason: Option<String>,
}

#[derive(Deserialize)]
struct ORMessageResp { content: String }

#[derive(Deserialize)]
struct ORUsage { prompt_tokens: u32, completion_tokens: u32 }
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn resolve_openrouter_key_does_not_panic() {
        // Smoke test — all three sources may or may not be set depending
        // on environment; just confirm the call returns cleanly.
        let _ = resolve_openrouter_key();
    }

    // Serialization shape check: the body must land on the wire with
    // OpenAI's exact field names.
    #[test]
    fn chat_body_serializes_to_openai_shape() {
        let body = ORChatBody {
            model: "openai/gpt-oss-120b:free".into(),
            messages: vec![
                ORMessage { role: "user".into(), content: "review this".into() },
            ],
            max_tokens: 800,
            temperature: 0.3,
            stream: false,
        };
        let json = serde_json::to_string(&body).unwrap();
        assert!(json.contains("\"model\":\"openai/gpt-oss-120b:free\""));
        assert!(json.contains("\"messages\""));
        assert!(json.contains("\"max_tokens\":800"));
        assert!(json.contains("\"stream\":false"));
    }

    #[test]
    fn model_prefix_strip_preserves_unprefixed() {
        // If caller passes "openrouter/openai/gpt-oss-120b:free" we strip.
        // If caller passes "openai/gpt-oss-120b:free" unchanged, we keep.
        let cases = [
            ("openrouter/openai/gpt-oss-120b:free", "openai/gpt-oss-120b:free"),
            ("openai/gpt-oss-120b:free", "openai/gpt-oss-120b:free"),
            ("google/gemma-3-27b-it:free", "google/gemma-3-27b-it:free"),
        ];
        for (input, expected) in cases {
            let out = input.strip_prefix("openrouter/").unwrap_or(input);
            assert_eq!(out, expected, "{input} should become {expected}");
        }
    }
}
|
||||
@ -1,150 +0,0 @@
|
||||
//! `/v1/respond` — the **execution** API (distinct from `/v1/chat`, the
|
||||
//! completion API).
|
||||
//!
|
||||
//! This is the consolidation move called out in the 2026-04-23 session:
|
||||
//! lift the proven pipeline from `tests/multi-agent/orchestrator.ts`
|
||||
//! (executor → reviewer → escalate → validate → seal playbook →
|
||||
//! write-through to KB) into the gateway, so the production path has
|
||||
//! the intelligence the tests already proved.
|
||||
//!
|
||||
//! `/v1/chat` stays a naive completion proxy for callers that want one.
|
||||
//! `/v1/respond` is where the loop lives. Every orchestrator-style
|
||||
//! caller migrates here and the TS harnesses become thin clients.
|
||||
//!
|
||||
//! This file holds the HTTP surface + request/response shapes. The loop
|
||||
//! itself lives in `execution_loop::ExecutionLoop`.
|
||||
|
||||
use axum::{extract::State, http::StatusCode, Json};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use super::V1State;
|
||||
use crate::execution_loop::{ExecutionLoop, LogEntry, RespondOutcome};
|
||||
|
||||
/// A structured task — mirrors `TaskSpec` in `tests/multi-agent/agent.ts`.
|
||||
/// Kept deliberately open so non-staffing task classes (code-gen,
|
||||
/// DevOps-long-horizon) can land without a schema fight.
|
||||
#[derive(Deserialize, Debug, Clone)]
pub struct RespondRequest {
    /// Task class — routes to the right truth rules + validator. For the
    /// staffing substrate: `staffing.fill`, `staffing.rescue`,
    /// `staffing.sms_draft`. Truth-layer lookup is a no-op until a rule
    /// set is registered for the class.
    pub task_class: String,

    /// Human-readable operation description — becomes the playbook
    /// `operation` field on seal, and the primary signal for
    /// playbook_memory embedding.
    pub operation: String,

    /// Free-form structured context. Passed to the executor prompt and
    /// to the playbook seeder. Staffing tasks expect
    /// `{target_role, target_count, target_city, target_state, approach_hint}`
    /// but nothing here validates that — the validator crate will (Phase 43).
    /// Defaults to JSON `null` when omitted.
    #[serde(default)]
    pub spec: serde_json::Value,

    /// Executor model. Defaults to the hot-path local model if omitted.
    /// See orchestrator.ts:28 (`EXECUTOR_MODEL = "qwen3.5:latest"`).
    #[serde(default)]
    pub executor_model: Option<String>,

    /// Reviewer model. Defaults to the hot-path local reviewer.
    /// See orchestrator.ts:29 (`REVIEWER_MODEL = "qwen3:latest"`).
    #[serde(default)]
    pub reviewer_model: Option<String>,

    /// Hard cap on executor turns. Default matches orchestrator.ts:30
    /// (`MAX_TURNS = 12`). Cloud escalation counts as a turn.
    #[serde(default)]
    pub max_turns: Option<u32>,
}

#[derive(Serialize)]
pub struct RespondResponse {
    /// `ok` = consensus reached, playbook sealed. `failed` = loop ran
    /// out of turns or hit the drift cap. `blocked` = truth-layer
    /// veto (Phase 42 rule citation in `error`).
    pub status: &'static str,

    /// The final artifact — for staffing fills, `{fills: [{candidate_id, name}]}`.
    /// Empty on failure / block.
    pub artifact: serde_json::Value,

    /// Structured cross-turn log. Same shape as orchestrator.ts LogEntry
    /// so existing tooling (kb extractors, fact_extractor.ts) reads it
    /// without change.
    pub log: Vec<LogEntry>,

    /// Iteration count actually used. ≤ max_turns. Stamped on
    /// outcomes.jsonl per the indicator audit (2026-04-23).
    pub iterations: u32,

    /// Error message on non-ok status. Truth-rule citations land here
    /// when `status == "blocked"`.
    // NOTE(review): `serde(default)` is a Deserialize attribute; on this
    // Serialize-only struct it is inert. skip_serializing_if does the work.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub error: Option<String>,
}
|
||||
|
||||
pub async fn respond(
|
||||
State(state): State<V1State>,
|
||||
Json(req): Json<RespondRequest>,
|
||||
) -> Result<Json<RespondResponse>, (StatusCode, String)> {
|
||||
if req.operation.is_empty() {
|
||||
return Err((StatusCode::BAD_REQUEST, "operation must be non-empty".into()));
|
||||
}
|
||||
if req.task_class.is_empty() {
|
||||
return Err((StatusCode::BAD_REQUEST, "task_class must be non-empty".into()));
|
||||
}
|
||||
|
||||
let mut loop_runner = ExecutionLoop::new(state, req);
|
||||
let outcome = loop_runner.run().await.map_err(|e| {
|
||||
(StatusCode::INTERNAL_SERVER_ERROR, format!("execution loop: {e}"))
|
||||
})?;
|
||||
|
||||
let (status, error) = match &outcome {
|
||||
RespondOutcome::Ok { .. } => ("ok", None),
|
||||
RespondOutcome::Failed { reason, .. } => ("failed", Some(reason.clone())),
|
||||
RespondOutcome::Blocked { reason, .. } => ("blocked", Some(reason.clone())),
|
||||
};
|
||||
|
||||
Ok(Json(RespondResponse {
|
||||
status,
|
||||
artifact: outcome.artifact(),
|
||||
log: outcome.into_log(),
|
||||
iterations: loop_runner.turns_used(),
|
||||
error,
|
||||
}))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn respond_request_parses_minimal() {
|
||||
let raw = r#"{
|
||||
"task_class": "staffing.fill",
|
||||
"operation": "fill: Welder x2 in Toledo, OH"
|
||||
}"#;
|
||||
let r: RespondRequest = serde_json::from_str(raw).unwrap();
|
||||
assert_eq!(r.task_class, "staffing.fill");
|
||||
assert_eq!(r.executor_model, None);
|
||||
assert_eq!(r.max_turns, None);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn respond_request_parses_full() {
|
||||
let raw = r#"{
|
||||
"task_class": "staffing.fill",
|
||||
"operation": "fill: Welder x2 in Toledo, OH",
|
||||
"spec": {"target_role": "Welder", "target_count": 2, "target_city": "Toledo", "target_state": "OH"},
|
||||
"executor_model": "qwen3.5:latest",
|
||||
"reviewer_model": "qwen3:latest",
|
||||
"max_turns": 12
|
||||
}"#;
|
||||
let r: RespondRequest = serde_json::from_str(raw).unwrap();
|
||||
assert_eq!(r.executor_model.as_deref(), Some("qwen3.5:latest"));
|
||||
assert_eq!(r.max_turns, Some(12));
|
||||
assert_eq!(r.spec["target_count"], 2);
|
||||
}
|
||||
}
|
||||
@ -1,235 +0,0 @@
|
||||
//! Coordinator session JSONL writer — Rust parity with the Go-side
|
||||
//! `internal/validator/session_log.go` (commit 1a3a82a in
|
||||
//! golangLAKEHOUSE). Same schema, same field names, same producer
|
||||
//! semantics, so a unified longitudinal log can pull from either
|
||||
//! runtime via DuckDB.
|
||||
//!
|
||||
//! Schema: `session.iterate.v1`. One row per `/v1/iterate` session.
|
||||
//! Append-only. Best-effort posture: errors warn and the iterate
|
||||
//! response always ships.
|
||||
//!
|
||||
//! See `golangLAKEHOUSE/docs/SESSION_LOG.md` for the full schema
|
||||
//! reference + DuckDB query examples. This module produces rows
|
||||
//! with `daemon: "gateway"`; the Go side produces `daemon:
|
||||
//! "validatord"`. Operators who want a unified stream can point both
|
||||
//! to the same path (the OS write-append is atomic for the row sizes
|
||||
//! we produce) or query both files together via duckdb's `read_json`
|
||||
//! glob support.
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::sync::Arc;
|
||||
use tokio::sync::Mutex;
|
||||
|
||||
pub const SESSION_RECORD_SCHEMA: &str = "session.iterate.v1";
|
||||
|
||||
/// One row in coordinator_sessions.jsonl. Field names are the on-wire
|
||||
/// names — must stay byte-equal to the Go side's
|
||||
/// `validator.SessionRecord` (proven by the cross-runtime parity
|
||||
/// probe at golangLAKEHOUSE/scripts/cutover/parity/).
|
||||
// Deserialize is supported so the parity helper binary can round-trip
|
||||
// fixture inputs through serde without hand-rolling a parser. Production
|
||||
// emit path uses Serialize only; SessionRecord rows are written by the
|
||||
// gateway and consumed by DuckDB / external tooling, never re-read by us.
|
||||
// Field names are the on-wire names and must stay byte-equal to the Go
// side's validator.SessionRecord — do not rename or reorder without
// re-running the cross-runtime parity probe.
#[derive(Serialize, Deserialize)]
pub struct SessionRecord {
    pub schema: String,
    pub session_id: String,
    pub timestamp: String,
    // Producer tag: this runtime writes "gateway"; Go writes "validatord".
    pub daemon: String,
    pub kind: String,
    pub model: String,
    pub provider: String,
    pub prompt: String,
    pub iterations: u32,
    pub max_iterations: u32,
    pub final_verdict: String, // "accepted" | "max_iter_exhausted" | "infra_error"
    pub attempts: Vec<SessionAttemptRecord>,
    // Optional fields are omitted (not null) to match the Go encoder.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub artifact: Option<serde_json::Value>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub grounded_in_roster: Option<bool>,
    pub duration_ms: u64,
}

// One row per executor attempt within a session.
#[derive(Serialize, Deserialize)]
pub struct SessionAttemptRecord {
    pub iteration: u32,
    pub verdict_kind: String, // "no_json" | "validation_failed" | "accepted" | "infra_error"
    #[serde(skip_serializing_if = "Option::is_none")]
    pub error: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub span_id: Option<String>,
}
|
||||
|
||||
/// Append-only writer. Cloneable handle — internal state is Arc'd so
|
||||
/// V1State can keep its own clone and per-request clones are cheap.
|
||||
// Cheap-to-clone handle over the shared writer state; V1State keeps
// one clone and per-request clones are an Arc refcount bump.
#[derive(Clone)]
pub struct SessionLogger {
    inner: Arc<Inner>,
}

struct Inner {
    // Target JSONL file path; parent dirs created lazily on first write.
    path: String,
    /// tokio::Mutex (not std) because we hold it across the async
    /// fs write. Contention is low (one row per /v1/iterate session).
    mu: Mutex<()>,
}
|
||||
|
||||
impl SessionLogger {
    /// Construct a logger writing to `path`. Empty path → None
    /// (skip the wiring in the iterate handler entirely).
    pub fn from_path(path: &str) -> Option<Self> {
        if path.is_empty() {
            return None;
        }
        Some(Self {
            inner: Arc::new(Inner {
                path: path.to_string(),
                mu: Mutex::new(()),
            }),
        })
    }

    /// Append one record. Best-effort: serialization and I/O failures
    /// land in `tracing::warn!` and the call simply returns — the fn
    /// returns `()`, so observability stays a witness, never a gate.
    /// (serde_json::to_string failing on this well-formed struct is
    /// not expected in practice, but is tolerated the same way.)
    pub async fn append(&self, rec: SessionRecord) {
        let body = match serde_json::to_string(&rec) {
            Ok(s) => s,
            Err(e) => {
                tracing::warn!(target: "v1.session_log", "marshal: {e}");
                return;
            }
        };
        // Serialize writers so concurrent appends never interleave rows.
        let _guard = self.inner.mu.lock().await;
        if let Err(e) = self.write(&body).await {
            tracing::warn!(target: "v1.session_log", "write {}: {e}", self.inner.path);
        }
    }

    // Open-append-write one row plus trailing newline. Caller holds
    // the mutex; this fn does no locking of its own.
    async fn write(&self, body: &str) -> std::io::Result<()> {
        use tokio::fs::OpenOptions;
        use tokio::io::AsyncWriteExt;

        // Lazy mkdir on first write so a not-yet-mounted volume at
        // startup doesn't kill the daemon.
        if let Some(parent) = std::path::Path::new(&self.inner.path).parent() {
            if !parent.as_os_str().is_empty() {
                tokio::fs::create_dir_all(parent).await?;
            }
        }
        let mut f = OpenOptions::new()
            .append(true)
            .create(true)
            .open(&self.inner.path)
            .await?;
        f.write_all(body.as_bytes()).await?;
        f.write_all(b"\n").await?;
        Ok(())
    }
}
|
||||
|
||||
/// Best-effort UTF-8 char truncation. Matches Go's `trim` helper so
|
||||
/// rows produced by either runtime cap fields at the same boundaries.
|
||||
/// Best-effort UTF-8 char truncation: keep at most `n` characters.
/// Matches Go's `trim` helper so rows produced by either runtime cap
/// fields at the same boundaries.
pub fn truncate(s: &str, n: usize) -> String {
    // Find the byte offset of the (n+1)-th char; slicing there keeps
    // exactly n chars. If the string is already ≤ n chars, copy it whole.
    match s.char_indices().nth(n) {
        Some((byte_idx, _)) => s[..byte_idx].to_string(),
        None => s.to_string(),
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;
    use tokio::fs;

    // Shared fixture: one fully-populated accepted-fill session row.
    fn fixture_record(session_id: &str) -> SessionRecord {
        SessionRecord {
            schema: SESSION_RECORD_SCHEMA.to_string(),
            session_id: session_id.to_string(),
            timestamp: "2026-05-02T08:00:00Z".to_string(),
            daemon: "gateway".to_string(),
            kind: "fill".to_string(),
            model: "qwen3.5:latest".to_string(),
            provider: "ollama".to_string(),
            prompt: "produce a fill artifact".to_string(),
            iterations: 1,
            max_iterations: 3,
            final_verdict: "accepted".to_string(),
            attempts: vec![SessionAttemptRecord {
                iteration: 0,
                verdict_kind: "accepted".to_string(),
                error: None,
                span_id: Some("span-0".to_string()),
            }],
            artifact: Some(serde_json::json!({"fills":[{"candidate_id":"W-1"}]})),
            grounded_in_roster: Some(true),
            duration_ms: 50,
        }
    }

    #[tokio::test]
    async fn from_path_empty_returns_none() {
        assert!(SessionLogger::from_path("").is_none());
    }

    // One append → one newline-terminated row carrying the schema tag.
    #[tokio::test]
    async fn append_writes_jsonl_row_with_schema_field() {
        let dir = tempdir();
        let path = dir.join("sessions.jsonl");
        let path_str = path.to_string_lossy().to_string();
        let logger = SessionLogger::from_path(&path_str).unwrap();
        logger.append(fixture_record("trace-a")).await;

        let body = fs::read_to_string(&path).await.unwrap();
        assert!(body.contains("\"schema\":\"session.iterate.v1\""));
        assert!(body.contains("\"session_id\":\"trace-a\""));
        assert!(body.contains("\"grounded_in_roster\":true"));
        assert!(body.ends_with('\n'));
    }

    // N concurrent appends must produce exactly N intact rows — the
    // internal mutex is what prevents interleaved/torn writes.
    #[tokio::test]
    async fn append_concurrent_safe() {
        let dir = tempdir();
        let path = dir.join("sessions.jsonl");
        let path_str = path.to_string_lossy().to_string();
        let logger = SessionLogger::from_path(&path_str).unwrap();

        let n = 32;
        let mut handles = Vec::with_capacity(n);
        for i in 0..n {
            let l = logger.clone();
            handles.push(tokio::spawn(async move {
                l.append(fixture_record(&format!("trace-{i}"))).await;
            }));
        }
        for h in handles {
            h.await.unwrap();
        }

        let body = fs::read_to_string(&path).await.unwrap();
        let lines: Vec<_> = body.lines().filter(|l| !l.is_empty()).collect();
        assert_eq!(lines.len(), n, "expected {n} rows, got {}", lines.len());
        // Every row must round-trip through serde — a torn write
        // would surface as a parse error.
        for line in lines {
            let _: serde_json::Value = serde_json::from_str(line).expect("valid json per row");
        }
    }

    fn tempdir() -> PathBuf {
        // Per-test unique path so prior runs don't pollute the next.
        // The static counter increments across the whole test binary,
        // so back-to-back tests in the same module get distinct dirs.
        static COUNTER: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(0);
        let n = COUNTER.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
        let p = std::env::temp_dir().join(format!(
            "session_log_test_{}_{}_{}",
            std::process::id(),
            chrono::Utc::now().timestamp_nanos_opt().unwrap_or(0),
            n,
        ));
        std::fs::create_dir_all(&p).unwrap();
        p
    }
}
|
||||
@ -1,47 +0,0 @@
|
||||
use serde::Serialize;
|
||||
use truth::default_truth_store;
|
||||
|
||||
// Note: truth_router() was a stub wrapper around a single /context route
|
||||
// that nothing called — v1/mod.rs wires get(truth::context) directly
|
||||
// onto its own router. Removed 2026-04-24 along with its #[allow(dead_code)]
|
||||
// attribute; the handler below is the real surface.
|
||||
|
||||
#[derive(Serialize)]
|
||||
pub struct ContextResponse {
|
||||
pub task_classes: Vec<String>,
|
||||
pub rules: Vec<RuleInfo>,
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
pub struct RuleInfo {
|
||||
pub id: String,
|
||||
pub task_class: String,
|
||||
pub description: String,
|
||||
}
|
||||
|
||||
pub async fn context() -> axum::Json<ContextResponse> {
|
||||
let store = default_truth_store();
|
||||
|
||||
let task_classes: Vec<String> = vec![
|
||||
"staffing.fill".to_string(),
|
||||
"staffing.rescue".to_string(),
|
||||
"staffing.sms_draft".to_string(),
|
||||
"staffing.any".to_string(),
|
||||
];
|
||||
|
||||
let mut rules = Vec::new();
|
||||
for tc in &task_classes {
|
||||
for rule in store.get_rules(tc) {
|
||||
rules.push(RuleInfo {
|
||||
id: rule.id.clone(),
|
||||
task_class: rule.task_class.clone(),
|
||||
description: rule.description.clone(),
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
axum::Json(ContextResponse {
|
||||
task_classes,
|
||||
rules,
|
||||
})
|
||||
}
|
||||
@ -1,82 +0,0 @@
|
||||
//! /v1/validate — gateway-side artifact validation endpoint.
|
||||
//!
|
||||
//! Phase 43 v3 part 2: makes the validator crate network-callable.
|
||||
//! Any caller (scrum loop, test harness, future agent) can POST a
|
||||
//! generated artifact and get back a Report (success) or
|
||||
//! ValidationError (failure with structured field/reason).
|
||||
//!
|
||||
//! Request shape:
|
||||
//! POST /v1/validate
|
||||
//! {
|
||||
//! "kind": "fill" | "email" | "playbook",
|
||||
//! "artifact": { ... },
|
||||
//! "context": { ... } // optional — folded into artifact._context
|
||||
//! }
|
||||
//!
|
||||
//! Response on success: 200 + Report JSON
|
||||
//! Response on failure: 422 + ValidationError JSON
|
||||
//! Response on bad request: 400 + plain-text error
|
||||
//!
|
||||
//! The shared WorkerLookup is loaded once at gateway startup from
|
||||
//! workers_500k.parquet (path configurable via LH_WORKERS_PARQUET
|
||||
//! env, defaults to data/datasets/workers_500k.parquet). Falls back
|
||||
//! to an empty InMemoryWorkerLookup if the file is missing — the
|
||||
//! validators will still run schema/length/PII checks but worker-
|
||||
//! existence checks will all fail (Consistency error), which is the
|
||||
//! correct behavior when the roster isn't configured.
|
||||
|
||||
use axum::{extract::State, http::StatusCode, response::IntoResponse, Json};
|
||||
use serde::Deserialize;
|
||||
use validator::{
|
||||
Artifact, Validator, ValidationError,
|
||||
staffing::{
|
||||
fill::FillValidator,
|
||||
email::EmailValidator,
|
||||
playbook::PlaybookValidator,
|
||||
},
|
||||
};
|
||||
|
||||
#[derive(Deserialize)]
|
||||
pub struct ValidateRequest {
|
||||
/// `"fill" | "email" | "playbook"` — picks which validator runs.
|
||||
pub kind: String,
|
||||
/// The artifact JSON (free-form; shape depends on `kind`).
|
||||
pub artifact: serde_json::Value,
|
||||
/// Optional context bag — merged into `artifact._context` so the
|
||||
/// validator can read fields like `target_count`, `city`,
|
||||
/// `client_id`, `candidate_id` without callers having to embed
|
||||
/// `_context` in the artifact themselves.
|
||||
#[serde(default)]
|
||||
pub context: Option<serde_json::Value>,
|
||||
}
|
||||
|
||||
pub async fn validate(
|
||||
State(state): State<super::V1State>,
|
||||
Json(req): Json<ValidateRequest>,
|
||||
) -> impl IntoResponse {
|
||||
// Merge context into artifact under `_context` so validators can
|
||||
// pull contract metadata uniformly.
|
||||
let mut artifact_value = req.artifact;
|
||||
if let Some(ctx) = req.context {
|
||||
if let Some(obj) = artifact_value.as_object_mut() {
|
||||
obj.insert("_context".to_string(), ctx);
|
||||
}
|
||||
}
|
||||
|
||||
// Dispatch.
|
||||
let workers = state.validate_workers.clone();
|
||||
let result: Result<validator::Report, ValidationError> = match req.kind.as_str() {
|
||||
"fill" => FillValidator::new(workers).validate(&Artifact::FillProposal(artifact_value)),
|
||||
"email" => EmailValidator::new(workers).validate(&Artifact::EmailDraft(artifact_value)),
|
||||
"playbook" => PlaybookValidator.validate(&Artifact::Playbook(artifact_value)),
|
||||
other => return (
|
||||
StatusCode::BAD_REQUEST,
|
||||
format!("unknown kind '{other}' — expected fill | email | playbook"),
|
||||
).into_response(),
|
||||
};
|
||||
|
||||
match result {
|
||||
Ok(report) => (StatusCode::OK, Json(report)).into_response(),
|
||||
Err(e) => (StatusCode::UNPROCESSABLE_ENTITY, Json(e)).into_response(),
|
||||
}
|
||||
}
|
||||
@ -8,7 +8,6 @@ shared = { path = "../shared" }
|
||||
storaged = { path = "../storaged" }
|
||||
catalogd = { path = "../catalogd" }
|
||||
vectord = { path = "../vectord" }
|
||||
journald = { path = "../journald" }
|
||||
tokio = { workspace = true }
|
||||
axum = { workspace = true, features = ["multipart"] }
|
||||
lopdf = { workspace = true }
|
||||
|
||||
@ -2,9 +2,10 @@
|
||||
/// When a source changes format (columns renamed, added, removed, type changed),
|
||||
/// the system detects the diff and can auto-map using AI or heuristic matching.
|
||||
|
||||
use arrow::datatypes::{DataType, SchemaRef};
|
||||
use arrow::datatypes::{DataType, Schema, SchemaRef};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
|
||||
/// A detected change between two schema versions.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
@ -222,8 +223,7 @@ fn find_similar_column<'a>(
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use arrow::datatypes::{Field, Schema};
|
||||
use std::sync::Arc;
|
||||
use arrow::datatypes::Field;
|
||||
|
||||
fn schema(fields: Vec<(&str, DataType)>) -> SchemaRef {
|
||||
Arc::new(Schema::new(
|
||||
|
||||
@ -3,7 +3,7 @@ use axum::{
|
||||
extract::{Multipart, Path, Query, State},
|
||||
http::{HeaderMap, StatusCode},
|
||||
response::IntoResponse,
|
||||
routing::{get, post},
|
||||
routing::{delete, get, patch, post},
|
||||
};
|
||||
use bytes::Bytes;
|
||||
use object_store::ObjectStore;
|
||||
@ -33,11 +33,6 @@ pub struct IngestState {
|
||||
/// Scheduled-ingest registry. The scheduler task runs against this
|
||||
/// store; HTTP CRUD endpoints write through it.
|
||||
pub schedules: schedule::ScheduleStore,
|
||||
/// Event journal for ADR-012 mutation history. Optional for back-compat
|
||||
/// with callers (like scheduled ingest tests) that don't wire it yet.
|
||||
/// When present, successful ingests emit a record_ingest event — closes
|
||||
/// P9-001 on the file-upload path. (2026-04-23)
|
||||
pub journal: Option<journald::journal::Journal>,
|
||||
}
|
||||
|
||||
/// Push `DatasetAppended` triggers for every HNSW index bound to this
|
||||
@ -141,22 +136,6 @@ async fn ingest_file(
|
||||
Ok(result) => {
|
||||
if !result.deduplicated {
|
||||
notify_agent_on_append(&state, &result.dataset_name).await;
|
||||
// P9-001 fix (2026-04-23): emit a mutation event on every
|
||||
// non-deduplicated ingest. Dedup no-ops don't need events
|
||||
// (ADR-020 register() is already idempotent on same fingerprint).
|
||||
if let Some(ref journal) = state.journal {
|
||||
if let Err(e) = journal.record_ingest(
|
||||
&result.dataset_name,
|
||||
result.rows as usize,
|
||||
"ingest_api",
|
||||
&filename,
|
||||
).await {
|
||||
tracing::warn!(
|
||||
"journal record_ingest failed for '{}': {}",
|
||||
result.dataset_name, e,
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
if result.deduplicated {
|
||||
Ok((StatusCode::OK, Json(result)))
|
||||
@ -651,108 +630,3 @@ async fn run_schedule_now(
|
||||
}
|
||||
Ok(Json(outcome))
|
||||
}
|
||||
|
||||
// ─── Tests ───
|
||||
|
||||
#[cfg(test)]
|
||||
mod journal_integration_tests {
|
||||
//! P9-001 integration test: prove that a successful ingest produces a
|
||||
//! journal.record_ingest event. Block 2 on PR #10 was "journal event
|
||||
//! verified live" being unbacked by the diff. This test makes the
|
||||
//! verification committed and reproducible.
|
||||
|
||||
use journald::journal::{Event, Journal};
|
||||
use object_store::memory::InMemory;
|
||||
use std::sync::Arc;
|
||||
|
||||
// Helper: build a bare Journal against an in-memory object store.
|
||||
// Flush threshold 1 so every recorded event is persisted immediately.
|
||||
fn test_journal() -> Journal {
|
||||
let store: Arc<dyn object_store::ObjectStore> = Arc::new(InMemory::new());
|
||||
Journal::new(store, 1)
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn journal_record_ingest_increments_counter() {
|
||||
// Arrange — fresh journal, counter starts at zero.
|
||||
let journal = test_journal();
|
||||
let stats0 = journal.stats().await;
|
||||
assert_eq!(stats0.total_events_created, 0);
|
||||
assert_eq!(stats0.buffer_events, 0);
|
||||
|
||||
// Act — simulate what the /ingest/file success path does.
|
||||
journal
|
||||
.record_ingest("test_dataset", 42, "ingest_api", "probe.csv")
|
||||
.await
|
||||
.expect("record_ingest should succeed");
|
||||
|
||||
// Assert — counter advanced, event exists. With threshold=1 the
|
||||
// event flushed to store; with threshold>N it would be in-buffer.
|
||||
let stats1 = journal.stats().await;
|
||||
assert_eq!(stats1.total_events_created, 1, "counter should reflect one recorded event");
|
||||
|
||||
// Assert — the event is retrievable by entity.
|
||||
let history = journal
|
||||
.get_entity_history("batch:42")
|
||||
.await
|
||||
.expect("history lookup");
|
||||
assert_eq!(history.len(), 1, "one event should be visible in history");
|
||||
let ev = &history[0];
|
||||
assert_eq!(ev.action, "ingest");
|
||||
assert_eq!(ev.entity_type, "test_dataset");
|
||||
assert_eq!(ev.actor, "ingest_api");
|
||||
assert!(
|
||||
ev.new_value.contains("probe.csv"),
|
||||
"new_value should carry source filename, got: {}",
|
||||
ev.new_value
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn optional_journal_field_none_is_valid_back_compat() {
|
||||
// IngestState.journal is Option<Journal>. Back-compat path: when
|
||||
// the field is None, the ingest handler MUST still succeed — the
|
||||
// journal call is fire-and-forget, never load-bearing.
|
||||
//
|
||||
// This test asserts the type shape: Option<Journal> is what we
|
||||
// expect. If a refactor makes it mandatory, this test forces an
|
||||
// explicit re-consideration.
|
||||
let none_journal: Option<Journal> = None;
|
||||
assert!(none_journal.is_none());
|
||||
|
||||
let some_journal: Option<Journal> = Some(test_journal());
|
||||
assert!(some_journal.is_some());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn journal_record_event_fields_match_adr_012_schema() {
|
||||
// ADR-012 locks the event schema: entity_type, entity_id, field,
|
||||
// action, old_value, new_value, actor, source, workspace_id plus
|
||||
// the auto-assigned event_id + timestamp. This test pins the
|
||||
// field names so a future refactor can't silently drop one.
|
||||
let journal = test_journal();
|
||||
let base = Event {
|
||||
event_id: String::new(),
|
||||
timestamp: chrono::Utc::now(),
|
||||
entity_type: "candidate".into(),
|
||||
entity_id: "CAND-0001".into(),
|
||||
field: "phone".into(),
|
||||
action: "update".into(),
|
||||
old_value: "555-0000".into(),
|
||||
new_value: "555-9999".into(),
|
||||
actor: "recruiter".into(),
|
||||
source: "api".into(),
|
||||
workspace_id: "ws-x".into(),
|
||||
};
|
||||
journal.record(base).await.expect("record should accept full-schema event");
|
||||
let h = journal
|
||||
.get_entity_history("CAND-0001")
|
||||
.await
|
||||
.expect("lookup");
|
||||
assert_eq!(h.len(), 1);
|
||||
assert_eq!(h[0].field, "phone");
|
||||
assert_eq!(h[0].old_value, "555-0000");
|
||||
assert_eq!(h[0].new_value, "555-9999");
|
||||
assert_eq!(h[0].workspace_id, "ws-x");
|
||||
}
|
||||
}
|
||||
|
||||
@ -72,14 +72,12 @@ async fn process_inbox(
|
||||
|
||||
let path = entry.path();
|
||||
|
||||
// Skip directories and hidden files. Bind filename once via
|
||||
// let-else so the subsequent use is unwrap-free — previous
|
||||
// version relied on a map_or guard above + an .unwrap() here
|
||||
// being consistent, which is a fragile invariant.
|
||||
if path.is_dir() { continue; }
|
||||
let Some(fn_os) = path.file_name() else { continue; };
|
||||
let filename = fn_os.to_string_lossy().to_string();
|
||||
if filename.starts_with('.') { continue; }
|
||||
// Skip directories and hidden files
|
||||
if path.is_dir() || path.file_name().map_or(true, |n| n.to_string_lossy().starts_with('.')) {
|
||||
continue;
|
||||
}
|
||||
|
||||
let filename = path.file_name().unwrap().to_string_lossy().to_string();
|
||||
tracing::info!("watcher: found new file '{}'", filename);
|
||||
|
||||
// Read file
|
||||
|
||||
@ -5,7 +5,7 @@
|
||||
/// Storage: events buffer in memory, flush to Parquet periodically.
|
||||
/// Query: load Parquet files, filter by entity/field/actor/time.
|
||||
|
||||
use arrow::array::{ArrayRef, RecordBatch, StringArray};
|
||||
use arrow::array::{ArrayRef, RecordBatch, StringArray, UInt64Array};
|
||||
use arrow::datatypes::{DataType, Field, Schema};
|
||||
use chrono::{DateTime, Utc};
|
||||
use object_store::ObjectStore;
|
||||
|
||||
@ -456,26 +456,6 @@ async fn build_lance_vector_index(path: &str, _dims: usize) -> Result<()> {
|
||||
.await
|
||||
.context("create_index")?;
|
||||
|
||||
// Also build the scalar btree on doc_id. This bench's
|
||||
// measure_random_access_lance uses take(row_position) which doesn't
|
||||
// need the btree, but the dataset this bench writes is also queried
|
||||
// downstream by /vectors/lance/doc/<name>/<doc_id> (the production
|
||||
// lookup path) — without this index that path falls back to a full
|
||||
// table scan. Cheap to build (~1.2s on 10M rows) and matches the
|
||||
// gateway's lance_migrate handler behavior so bench-produced datasets
|
||||
// are immediately production-shape.
|
||||
use lance_index::scalar::ScalarIndexParams;
|
||||
dataset
|
||||
.create_index(
|
||||
&["doc_id"],
|
||||
IndexType::Scalar,
|
||||
Some("doc_id_btree".into()),
|
||||
&ScalarIndexParams::default(),
|
||||
true,
|
||||
)
|
||||
.await
|
||||
.context("create_index doc_id btree")?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
@ -7,7 +7,6 @@ edition = "2024"
|
||||
shared = { path = "../shared" }
|
||||
catalogd = { path = "../catalogd" }
|
||||
storaged = { path = "../storaged" }
|
||||
truth = { path = "../truth" }
|
||||
tokio = { workspace = true }
|
||||
axum = { workspace = true }
|
||||
serde = { workspace = true }
|
||||
|
||||
@ -84,17 +84,14 @@ pub async fn compact(
|
||||
// Load deltas
|
||||
let delta_batches = load_deltas(store, dataset_name).await?;
|
||||
let delta_count = delta_batches.len();
|
||||
// Row counts captured before extend; previously base_rows subtracted delta_count (files) from rows — unit mismatch.
|
||||
let base_row_count: usize = base_batches.iter().map(|b| b.num_rows()).sum();
|
||||
let delta_row_count: usize = delta_batches.iter().map(|b| b.num_rows()).sum();
|
||||
|
||||
let has_tombstones = !tombstones.is_empty();
|
||||
let nothing_to_do = delta_batches.is_empty() && !has_tombstones;
|
||||
if nothing_to_do {
|
||||
return Ok(CompactResult {
|
||||
base_rows: base_row_count,
|
||||
base_rows: base_batches.iter().map(|b| b.num_rows()).sum(),
|
||||
delta_rows: 0,
|
||||
final_rows: base_row_count,
|
||||
final_rows: base_batches.iter().map(|b| b.num_rows()).sum(),
|
||||
deltas_merged: 0,
|
||||
tombstones_applied: 0,
|
||||
rows_dropped_by_tombstones: 0,
|
||||
@ -102,7 +99,7 @@ pub async fn compact(
|
||||
}
|
||||
|
||||
base_batches.extend(delta_batches);
|
||||
let pre_filter_rows: usize = base_row_count + delta_row_count;
|
||||
let pre_filter_rows: usize = base_batches.iter().map(|b| b.num_rows()).sum();
|
||||
|
||||
// If primary key specified, deduplicate (keep last occurrence)
|
||||
let merged_batches = if let Some(_pk) = primary_key_col {
|
||||
@ -186,8 +183,8 @@ pub async fn compact(
|
||||
);
|
||||
|
||||
Ok(CompactResult {
|
||||
base_rows: base_row_count,
|
||||
delta_rows: delta_row_count,
|
||||
base_rows: pre_filter_rows - delta_count, // rough base-before-deltas
|
||||
delta_rows: delta_count,
|
||||
final_rows,
|
||||
deltas_merged: delta_count,
|
||||
tombstones_applied: tombstones.len(),
|
||||
|
||||
@ -9,9 +9,7 @@ use axum::{
|
||||
};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use std::sync::Arc;
|
||||
use truth::{RuleAction, TruthStore};
|
||||
|
||||
use crate::cache::CacheStats;
|
||||
use crate::context::QueryEngine;
|
||||
use crate::delta;
|
||||
use crate::paged::ResultStore;
|
||||
@ -20,26 +18,12 @@ use crate::paged::ResultStore;
|
||||
pub struct QueryState {
|
||||
pub engine: QueryEngine,
|
||||
pub result_store: ResultStore,
|
||||
// Policy gate for incoming SQL. Every /sql and /paged request is
|
||||
// evaluated against this store before hitting DataFusion. Added for
|
||||
// P42-002 ("raw SQL forwarded without schema or policy gate") after
|
||||
// the scrum master's queryd/service.rs finding looped across iters
|
||||
// 3-5 without ever being reachable by the 6-line auto-applier.
|
||||
pub truth: Arc<TruthStore>,
|
||||
}
|
||||
|
||||
pub fn router(engine: QueryEngine) -> Router {
|
||||
router_with_truth(engine, Arc::new(truth::sql_query_guard_store()))
|
||||
}
|
||||
|
||||
/// Test/integration hook: construct the router with a caller-supplied
|
||||
/// TruthStore so tests can assert reject/pass behavior deterministically
|
||||
/// without depending on the default needle list.
|
||||
pub fn router_with_truth(engine: QueryEngine, truth: Arc<TruthStore>) -> Router {
|
||||
let state = QueryState {
|
||||
engine: engine.clone(),
|
||||
result_store: ResultStore::new(100, 50), // 100 rows/page, keep 50 results
|
||||
truth,
|
||||
};
|
||||
Router::new()
|
||||
.route("/health", get(health))
|
||||
@ -69,11 +53,6 @@ struct QueryResponse {
|
||||
columns: Vec<ColumnInfo>,
|
||||
rows: serde_json::Value,
|
||||
row_count: usize,
|
||||
// Elapsed wall time from handler entry to response. Required for
|
||||
// audit-log parity — gateway's audit row previously stored null here.
|
||||
// Scrum iter 9 finding, populated from std::time::Instant captured
|
||||
// at the top of execute_query / paged_query.
|
||||
latency_ms: u64,
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
@ -93,41 +72,12 @@ fn batches_to_json(batches: &[RecordBatch]) -> Result<serde_json::Value, String>
|
||||
serde_json::from_slice(&buf).map_err(|e| format!("JSON parse error: {e}"))
|
||||
}
|
||||
|
||||
/// Evaluate the request SQL against the configured TruthStore. Returns
|
||||
/// the Reject/Block message on the first failing mandatory rule so the
|
||||
/// handler can short-circuit. Returns None when all rules pass (or when
|
||||
/// the failures' declared action is non-mandatory like Redact/Pass).
|
||||
fn sql_policy_check(truth: &TruthStore, sql: &str) -> Option<String> {
|
||||
let ctx = serde_json::json!({ "sql": sql });
|
||||
for outcome in truth.evaluate("sql_query", &ctx) {
|
||||
if !outcome.passed {
|
||||
// FieldEmpty / FieldContainsAny etc. are enforced only when
|
||||
// condition HOLDS (i.e. passed=true). Below means "passed=false",
|
||||
// so the rule condition did not hold — no enforcement.
|
||||
continue;
|
||||
}
|
||||
match &outcome.action {
|
||||
RuleAction::Reject { message } | RuleAction::Block { message } => {
|
||||
return Some(message.clone());
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
async fn execute_query(
|
||||
State(state): State<QueryState>,
|
||||
Json(req): Json<QueryRequest>,
|
||||
) -> impl IntoResponse {
|
||||
let started = std::time::Instant::now();
|
||||
tracing::info!("executing query: {}", req.sql);
|
||||
|
||||
if let Some(reason) = sql_policy_check(&state.truth, &req.sql) {
|
||||
tracing::warn!("sql rejected by truth gate: {reason}");
|
||||
return Err((StatusCode::FORBIDDEN, reason));
|
||||
}
|
||||
|
||||
match state.engine.query(&req.sql).await {
|
||||
Ok(batches) => {
|
||||
if batches.is_empty() {
|
||||
@ -135,7 +85,6 @@ async fn execute_query(
|
||||
columns: vec![],
|
||||
rows: serde_json::Value::Array(vec![]),
|
||||
row_count: 0,
|
||||
latency_ms: started.elapsed().as_millis() as u64,
|
||||
}));
|
||||
}
|
||||
|
||||
@ -154,7 +103,6 @@ async fn execute_query(
|
||||
columns,
|
||||
rows,
|
||||
row_count,
|
||||
latency_ms: started.elapsed().as_millis() as u64,
|
||||
}))
|
||||
}
|
||||
Err(e) => Err((StatusCode::BAD_REQUEST, e)),
|
||||
@ -168,10 +116,6 @@ async fn paged_query(
|
||||
Json(req): Json<QueryRequest>,
|
||||
) -> impl IntoResponse {
|
||||
tracing::info!("paged query: {}", req.sql);
|
||||
if let Some(reason) = sql_policy_check(&state.truth, &req.sql) {
|
||||
tracing::warn!("paged sql rejected by truth gate: {reason}");
|
||||
return Err((StatusCode::FORBIDDEN, reason));
|
||||
}
|
||||
match state.result_store.execute_and_store(&state.engine, &req.sql).await {
|
||||
Ok(handle) => Ok(Json(handle)),
|
||||
Err(e) => Err((StatusCode::BAD_REQUEST, e)),
|
||||
@ -268,65 +212,3 @@ async fn compact_dataset(
|
||||
Err(e) => Err((StatusCode::INTERNAL_SERVER_ERROR, e)),
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod sql_policy_tests {
|
||||
use super::*;
|
||||
use truth::sql_query_guard_store;
|
||||
|
||||
// These tests exercise the policy gate without spinning up a DataFusion
|
||||
// engine — they only need `TruthStore`. Purpose: prove the P42-002
|
||||
// enforcement point actually rejects destructive SQL. This is the
|
||||
// regression guard for the queryd/service.rs finding that looped
|
||||
// across scrum iters 3-5.
|
||||
|
||||
#[test]
|
||||
fn blocks_drop_table() {
|
||||
let store = sql_query_guard_store();
|
||||
let reason = sql_policy_check(&store, "DROP TABLE users").expect("must reject");
|
||||
assert!(reason.contains("destructive"), "reason: {reason}");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn blocks_delete_from() {
|
||||
let store = sql_query_guard_store();
|
||||
assert!(sql_policy_check(&store, "delete from t where 1=1").is_some());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn blocks_truncate() {
|
||||
let store = sql_query_guard_store();
|
||||
assert!(sql_policy_check(&store, "TRUNCATE workers").is_some());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn blocks_empty_sql() {
|
||||
let store = sql_query_guard_store();
|
||||
assert!(sql_policy_check(&store, "").is_some());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn allows_benign_select() {
|
||||
let store = sql_query_guard_store();
|
||||
assert!(sql_policy_check(&store, "SELECT count(*) FROM workers").is_none());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn allows_select_with_deleted_word_in_column() {
|
||||
// Substring match is narrow ("delete from", not "delete"), so a
|
||||
// column named `deleted_at` doesn't trip the guard. Important
|
||||
// check — false positives on benign queries would make the gate
|
||||
// unusable in practice.
|
||||
let store = sql_query_guard_store();
|
||||
assert!(
|
||||
sql_policy_check(&store, "SELECT deleted_at FROM t").is_none(),
|
||||
"column names containing 'delete' must not be rejected"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn case_insensitive_match_catches_mixed_case() {
|
||||
let store = sql_query_guard_store();
|
||||
assert!(sql_policy_check(&store, "Drop Table X").is_some());
|
||||
}
|
||||
}
|
||||
|
||||
@ -2,12 +2,15 @@
|
||||
/// Each workspace tracks an agent's activity on a specific contract or search,
|
||||
/// with daily/weekly/monthly tiers and instant handoff capability.
|
||||
|
||||
use arrow::array::{ArrayRef, RecordBatch, StringArray, Int64Array};
|
||||
use arrow::datatypes::{DataType, Field, Schema};
|
||||
use chrono::{DateTime, Utc};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
use tokio::sync::RwLock;
|
||||
|
||||
use crate::delta;
|
||||
use object_store::ObjectStore;
|
||||
|
||||
/// Retention tier for workspace data.
|
||||
|
||||
@ -62,15 +62,6 @@ pub struct GatewayConfig {
|
||||
pub host: String,
|
||||
#[serde(default = "default_gateway_port")]
|
||||
pub port: u16,
|
||||
/// Coordinator session JSONL output path. One row per
|
||||
/// `/v1/iterate` session, schema=`session.iterate.v1`. Empty =
|
||||
/// disabled. Cross-runtime parity with the Go side's
|
||||
/// `[validatord].session_log_path` (added 2026-05-02). Default
|
||||
/// empty so existing deployments aren't perturbed; production
|
||||
/// sets `/var/lib/lakehouse/gateway/sessions.jsonl`. See
|
||||
/// `golangLAKEHOUSE/docs/SESSION_LOG.md` for query examples.
|
||||
#[serde(default)]
|
||||
pub session_log_path: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Deserialize)]
|
||||
@ -158,13 +149,7 @@ fn default_gateway_port() -> u16 { 3100 }
|
||||
fn default_storage_root() -> String { "./data".to_string() }
|
||||
fn default_profile_root() -> String { "./data/_profiles".to_string() }
|
||||
fn default_manifest_prefix() -> String { "_catalog/manifests".to_string() }
|
||||
// Post-2026-05-02: AiClient talks directly to Ollama; the Python
|
||||
// sidecar's hot-path role was retired. The config field name
|
||||
// `[sidecar].url` is preserved for migration compatibility (operators
|
||||
// with existing TOMLs don't need to rename anything), but the value
|
||||
// now points at Ollama. Lab UI / pipeline_lab Python remains as a
|
||||
// dev-only tool; not on this URL.
|
||||
fn default_sidecar_url() -> String { "http://localhost:11434".to_string() }
|
||||
fn default_sidecar_url() -> String { "http://localhost:3200".to_string() }
|
||||
fn default_embed_model() -> String { "nomic-embed-text".to_string() }
|
||||
fn default_gen_model() -> String { "qwen2.5".to_string() }
|
||||
fn default_rerank_model() -> String { "qwen2.5".to_string() }
|
||||
@ -199,11 +184,7 @@ impl Config {
|
||||
impl Default for Config {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
gateway: GatewayConfig {
|
||||
host: default_host(),
|
||||
port: default_gateway_port(),
|
||||
session_log_path: String::new(),
|
||||
},
|
||||
gateway: GatewayConfig { host: default_host(), port: default_gateway_port() },
|
||||
storage: StorageConfig {
|
||||
root: default_storage_root(),
|
||||
profile_root: default_profile_root(),
|
||||
|
||||
@ -4,5 +4,3 @@ pub mod arrow_helpers;
|
||||
pub mod config;
|
||||
pub mod pii;
|
||||
pub mod secrets;
|
||||
pub mod model_matrix;
|
||||
pub mod profiles;
|
||||
|
||||
@ -1,69 +0,0 @@
|
||||
//! Per-model token accounting. Entry point for the ModelMatrix work
|
||||
//! the aibridge `context::estimate_tokens` deprecation has been pointing
|
||||
//! at. Starts minimal — just `estimate_tokens` — so call sites can
|
||||
//! migrate off the deprecated helper. Extend with per-model context
|
||||
//! windows, max_tokens defaults, provider hints, etc. as we move the
|
||||
//! rest of `aibridge::context::known_windows` over.
|
||||
|
||||
/// Namespace for per-model token + context accounting. Methods are
|
||||
/// associated functions — no instance required — because the underlying
|
||||
/// estimates are deterministic and stateless.
|
||||
pub struct ModelMatrix;
|
||||
|
||||
impl ModelMatrix {
|
||||
/// Rough token count — char count divided by 4, rounded up. This
|
||||
/// is the same heuristic OpenAI's cookbook uses for English text;
|
||||
/// it's within ±15% of BPE tokenizers for code + prose and doesn't
|
||||
/// require a tokenizer lookup. Good enough for budget math where
|
||||
/// the goal is "don't blow the context window" rather than exact
|
||||
/// billing.
|
||||
///
|
||||
/// Moved from `aibridge::context::estimate_tokens` (still there with
|
||||
/// a `#[deprecated]` pointer — callers should migrate here). Empty
|
||||
/// string → 0; one char → 1 (ceiling of 1/4 = 1).
|
||||
pub fn estimate_tokens(text: &str) -> usize {
|
||||
(text.chars().count() + 3) / 4
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::ModelMatrix;
|
||||
|
||||
#[test]
|
||||
fn empty_string_is_zero_tokens() {
|
||||
assert_eq!(ModelMatrix::estimate_tokens(""), 0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn three_chars_is_one_token() {
|
||||
// 3 → ceil(3/4) = 1. Matches the deprecated helper's behavior
|
||||
// so the migration is a drop-in replacement.
|
||||
assert_eq!(ModelMatrix::estimate_tokens("abc"), 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn four_chars_is_one_token() {
|
||||
assert_eq!(ModelMatrix::estimate_tokens("abcd"), 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn five_chars_is_two_tokens() {
|
||||
assert_eq!(ModelMatrix::estimate_tokens("abcde"), 2);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn counts_chars_not_bytes() {
|
||||
// Multi-byte UTF-8 chars count as 1 char each — important for
|
||||
// prompts with emoji or non-ASCII text. "héllo" is 5 chars
|
||||
// (5 unicode scalars) → ceil(5/4) = 2 tokens, same as "hello".
|
||||
assert_eq!(ModelMatrix::estimate_tokens("héllo"), 2);
|
||||
assert_eq!(ModelMatrix::estimate_tokens("📚📚📚📚"), 1); // 4 chars
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn large_text_scales_linearly() {
|
||||
assert_eq!(ModelMatrix::estimate_tokens(&"x".repeat(400)), 100);
|
||||
assert_eq!(ModelMatrix::estimate_tokens(&"x".repeat(401)), 101);
|
||||
}
|
||||
}
|
||||
@ -1,14 +0,0 @@
|
||||
//! ExecutionProfile — the Phase 41 rename of Phase 17's ModelProfile.
|
||||
//!
|
||||
//! Carries what's needed to RUN inference: model tag, dataset bindings,
|
||||
//! HNSW config, embed model, bucket binding. Today this is a type
|
||||
//! alias over `crate::types::ModelProfile` — the PRD's
|
||||
//! "Backward compat: ModelProfile still loads, aliased to
|
||||
//! ExecutionProfile" line, honored literally.
|
||||
//!
|
||||
//! When the migration off the old name finishes, this file can either
|
||||
//! absorb the full struct definition or continue as an alias. Callers
|
||||
//! should reference `ExecutionProfile` going forward; `ModelProfile`
|
||||
//! stays exported from `types` for on-disk schema compat.
|
||||
|
||||
pub use crate::types::ModelProfile as ExecutionProfile;
|
||||
@ -1,38 +0,0 @@
|
||||
//! MemoryProfile — how the agent's execution memory is kept.
|
||||
//!
|
||||
//! Phase 41 decomposition: the Phase 19 playbook_memory + Phase 26
|
||||
//! successful_playbooks + Phase 45 doc_refs all need per-profile
|
||||
//! tuning. Rather than bolt those onto ExecutionProfile, they live
|
||||
//! here so a "thin" execution profile can reuse a "fat" memory
|
||||
//! profile and vice versa.
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct MemoryProfile {
|
||||
pub id: String,
|
||||
#[serde(default)]
|
||||
pub description: String,
|
||||
/// Phase 19: ceiling for playbook_memory boost on retrieval. 0
|
||||
/// disables the boost entirely.
|
||||
#[serde(default = "default_boost_ceiling")]
|
||||
pub playbook_boost_ceiling: f32,
|
||||
/// Phase 26: max history entries retained before rotation.
|
||||
#[serde(default = "default_history_cap")]
|
||||
pub history_cap: usize,
|
||||
/// Phase 45: stale threshold for doc_refs before drift check
|
||||
/// fires (hours).
|
||||
#[serde(default = "default_stale_hours")]
|
||||
pub doc_stale_hours: u32,
|
||||
/// Phase 28: auto-retire playbooks that fail 3+ consecutive runs.
|
||||
#[serde(default = "default_true")]
|
||||
pub auto_retire_on_failure: bool,
|
||||
pub created_at: chrono::DateTime<chrono::Utc>,
|
||||
#[serde(default)]
|
||||
pub created_by: String,
|
||||
}
|
||||
|
||||
fn default_boost_ceiling() -> f32 { 0.35 }
|
||||
fn default_history_cap() -> usize { 1000 }
|
||||
fn default_stale_hours() -> u32 { 168 } // one week
|
||||
fn default_true() -> bool { true }
|
||||
@ -1,28 +0,0 @@
|
||||
//! Phase 41 profile types.
|
||||
//!
|
||||
//! The existing `ModelProfile` (Phase 17) is aliased as
|
||||
//! `ExecutionProfile` here — it continues to carry the model +
|
||||
//! bindings + HNSW config needed to run inference. Three new profile
|
||||
//! types land alongside: `RetrievalProfile`, `MemoryProfile`,
|
||||
//! `ObserverProfile` — each owns a distinct slice of what used to be
|
||||
//! bundled.
|
||||
//!
|
||||
//! Backward-compat rule (PRD Phase 41): existing `ModelProfile` on
|
||||
//! disk continues to deserialize unchanged. New fields on the new
|
||||
//! profile types are `#[serde(default)]` so old payloads load with
|
||||
//! empty defaults.
|
||||
//!
|
||||
//! These are the canonical shapes — downstream code converts via
|
||||
//! `From<ModelProfile> for ExecutionProfile` (they're the same struct
|
||||
//! today, just named differently) and constructs the other three
|
||||
//! as needed.
|
||||
|
||||
pub mod execution;
|
||||
pub mod retrieval;
|
||||
pub mod memory;
|
||||
pub mod observer;
|
||||
|
||||
pub use execution::ExecutionProfile;
|
||||
pub use memory::MemoryProfile;
|
||||
pub use observer::ObserverProfile;
|
||||
pub use retrieval::RetrievalProfile;
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user