Claude (review-harness setup) e346b54e0f Phase C — local-Ollama LLM review wired end-to-end
Implements PROMPT.md / docs/REVIEW_PIPELINE.md Phase 2:
- internal/llm/ollama.go — real Ollama provider:
  - HealthCheck probes /api/tags + a 1-token completion + a JSON-mode
    probe ({"ok": true} round-trip), populating the model-doctor.json
    schema documented in docs/LOCAL_MODEL_SETUP.md
  - Complete + CompleteJSON via /api/chat with stream=false
  - think=false set for ALL completions (qwen3.5:latest is reasoning-
    capable but the inner-loop hot path wants direct answers, not
    reasoning traces consuming the token budget — same finding as
    the Lakehouse-Go chatd 2026-04-30 wave)
- internal/llm/review.go — Reviewer wrapper:
  - 2-attempt flow: prompt → parse → repair-prompt → parse (parse/normalize
    half sketched after this list)
  - Strict JSON shape enforced; markdown fences stripped before parse
  - Severity normalized to enum; out-of-range confidence clamped
  - Per-file chunking (file-level for v0; function-level Phase D+)
  - Bounded by review-profile max_file_bytes + max_llm_chunk_chars
- pipeline.go — Phase 2 wired between static scan + report gen:
  - --enable-llm flag opts in (off by default — static-only is
    cheaper and faster)
  - Raw output ALWAYS saved to llm-findings.raw.json (forensics)
  - Normalized findings → llm-findings.normalized.json
  - LLM findings merged into the report findings list (sourced
    "llm" so consumers can filter)
  - Receipts honestly mark phase status: "ok" | "degraded" | "skipped"
- cli model doctor — real probes replace the Phase A stub.
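
For orientation, a sketch of the parse/normalize half of that review.go
flow. Illustrative only — the helper and type names, the severity enum
values, and the "medium" fallback are assumptions, not the shipped code:

package llm

import (
	"encoding/json"
	"strings"
)

type finding struct {
	Severity   string  `json:"severity"`
	Confidence float64 `json:"confidence"`
	Evidence   string  `json:"evidence"`
}

// stripFences drops a wrapping ```json ... ``` fence pair, if present.
func stripFences(s string) string {
	s = strings.TrimSpace(s)
	s = strings.TrimPrefix(s, "```json")
	s = strings.TrimPrefix(s, "```")
	s = strings.TrimSuffix(s, "```")
	return strings.TrimSpace(s)
}

// parseFindings is one attempt of the two-attempt flow; on error the
// caller builds a repair prompt from the raw output and tries once more.
func parseFindings(raw string) ([]finding, error) {
	var fs []finding
	if err := json.Unmarshal([]byte(stripFences(raw)), &fs); err != nil {
		return nil, err
	}
	for i := range fs {
		// Normalize severity to the enum (values assumed here).
		switch strings.ToLower(fs[i].Severity) {
		case "low", "medium", "high", "critical":
			fs[i].Severity = strings.ToLower(fs[i].Severity)
		default:
			fs[i].Severity = "medium"
		}
		// Clamp out-of-range confidence into [0, 1].
		if fs[i].Confidence < 0 {
			fs[i].Confidence = 0
		}
		if fs[i].Confidence > 1 {
			fs[i].Confidence = 1
		}
	}
	return fs, nil
}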

Verified:
- model doctor: status="ok" with qwen3.5:latest + qwen3:latest both
  loaded, basic_prompt_ok=true, json_mode_ok=true (full shape sketched below)
- insecure-repo with --enable-llm: 9 LLM findings; qwen3.5 correctly
  flagged SQLi, RCE, hardcoded credentials as critical with verbatim
  evidence; 27s wall for 3 chunks
- clean-repo with --enable-llm: 0 LLM findings, 4 parsed chunks, 2.8s
- self-review with --enable-llm: 77 LLM findings + 83 static; 3 of
  ~30 chunks needed retry (PROMPT.md, REPORT_SCHEMA.md,
  SCRUM_TEST_TEMPLATE.md — all eventually parsed); 5min wall
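
The model-doctor.json these probes fill in looks roughly like this — keys
taken from the HealthCheck doc comment and HealthStatus fields below; the
exact casing of "status" and "errors" is assumed:

{
  "status": "ok",
  "server_available": true,
  "primary_model_available": true,
  "fallback_model_available": true,
  "basic_prompt_ok": true,
  "json_mode_ok": true,
  "errors": []
}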

go vet + go test -short clean. Fixture stray.go now `package fixture`
so Go tooling doesn't choke on the orphan file.

Phase D (validator cross-check) + Phase E (memory + diff/rules
subcommands) remain.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-30 01:13:39 -05:00


// Ollama provider — local-first per PROMPT.md.
//
// HealthCheck: probes /api/tags (server up + model list) + a 1-token
// completion + a strict-JSON probe. Used by `model doctor`.
//
// Complete + CompleteJSON: POST /api/chat with stream=false. JSON
// mode uses Ollama's native `format: "json"` — newer Ollama versions
// also accept a JSON Schema there but format=json is the lowest-
// common-denominator that works back to 0.4.
//
// `think: false` is set for ALL completions per the Lakehouse-Go
// 2026-04-30 finding: qwen3.5:latest and qwen3:latest are reasoning-
// capable but the inner-loop hot path wants direct answers, not
// `<think>` traces consuming the token budget. Callers that NEED
// reasoning override via opts (Phase F+, not yet wired).
package llm

import (
	"bytes"
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"net/http"
	"strings"
	"time"
)

// OllamaProvider is the concrete impl. Stateless; safe for concurrent
// use (the http.Client handles connection pooling).
type OllamaProvider struct {
	baseURL    string
	httpClient *http.Client
}

// NewOllama returns a provider pointed at baseURL. Empty baseURL
// defaults to http://localhost:11434. timeout 0 → 120s (matches
// model-profile default).
func NewOllama(baseURL string, timeout time.Duration) *OllamaProvider {
	if baseURL == "" {
		baseURL = "http://localhost:11434"
	}
	if timeout == 0 {
		timeout = 120 * time.Second
	}
	return &OllamaProvider{
		baseURL:    strings.TrimRight(baseURL, "/"),
		httpClient: &http.Client{Timeout: timeout},
	}
}

func (o *OllamaProvider) Name() string { return "ollama" }

// HealthCheck runs the 5 probes behind the model-doctor.json shape
// documented in REPORT_SCHEMA.md:
// - server_available: GET /api/tags returns 2xx
// - primary_model_available: name appears in tag list
// - fallback_model_available: name appears in tag list
// - basic_prompt_ok: a short "reply OK" completion round-trips
// - json_mode_ok: a JSON probe parses cleanly
//
// Errors surface in HealthStatus.Errors as human-readable strings
// (no stack trace shape — operators run this from a shell).
func (o *OllamaProvider) HealthCheck(ctx context.Context, primary, fallback string) HealthStatus {
	st := HealthStatus{Errors: []string{}}

	// 1. Server availability + model list
	tags, err := o.listTags(ctx)
	if err != nil {
		st.Errors = append(st.Errors, "list models: "+err.Error())
		return st
	}
	st.ServerAvailable = true
	loaded := map[string]bool{}
	for _, t := range tags {
		loaded[t] = true
	}
	st.PrimaryModelAvailable = primary != "" && loaded[primary]
	st.FallbackModelAvailable = fallback != "" && loaded[fallback]

	// Pick the model we'll use for the live probes — primary if
	// loaded, else fallback, else the first model Ollama has.
	probeModel := ""
	switch {
	case st.PrimaryModelAvailable:
		probeModel = primary
	case st.FallbackModelAvailable:
		probeModel = fallback
	case len(tags) > 0:
		probeModel = tags[0]
		st.Errors = append(st.Errors,
			fmt.Sprintf("neither primary %q nor fallback %q loaded; using %q for liveness probe", primary, fallback, probeModel))
	default:
		st.Errors = append(st.Errors, "no models loaded; can't run liveness probe")
		return st
	}

	// 2. Basic completion
	if got, err := o.Complete(ctx, probeModel, "Reply with the single word: OK", CompleteOptions{Temperature: 0, MaxTokens: 8, TimeoutSeconds: 30}); err != nil {
		st.Errors = append(st.Errors, "basic prompt: "+err.Error())
	} else if strings.TrimSpace(got) != "" {
		st.BasicPromptOK = true
	}

	// 3. JSON-mode completion
	jsonGot, err := o.CompleteJSON(ctx, probeModel, `Output exactly this JSON and nothing else: {"ok": true}`, CompleteOptions{Temperature: 0, MaxTokens: 32, TimeoutSeconds: 30})
	if err != nil {
		st.Errors = append(st.Errors, "json mode: "+err.Error())
	} else {
		var probe struct{ Ok bool }
		if json.Unmarshal([]byte(jsonGot), &probe) == nil {
			st.JSONModeOK = true
		} else {
			st.Errors = append(st.Errors, "json mode: parse failed; raw="+abbrev(jsonGot, 200))
		}
	}
	return st
}

// Complete posts to /api/chat with stream=false. Returns just the
// assistant content; token counts not surfaced (callers that need
// them go via the chat-shape API directly, which we'll expose later).
func (o *OllamaProvider) Complete(ctx context.Context, model, prompt string, opts CompleteOptions) (string, error) {
	body := o.chatBody(model, prompt, opts, false)
	return o.postChat(ctx, body, opts)
}

// CompleteJSON requests Ollama's native JSON-mode constrained output.
// The `format: "json"` field forces grammar-constrained generation —
// the model can only emit valid JSON. Some models still emit garbage
// in the content field (e.g. preamble text); validation is the
// caller's job (PROMPT.md "AI may suggest. Code validates.").
func (o *OllamaProvider) CompleteJSON(ctx context.Context, model, prompt string, opts CompleteOptions) (string, error) {
	body := o.chatBody(model, prompt, opts, true)
	return o.postChat(ctx, body, opts)
}

func (o *OllamaProvider) chatBody(model, prompt string, opts CompleteOptions, jsonMode bool) map[string]any {
	options := map[string]any{}
	if opts.Temperature != 0 {
		options["temperature"] = opts.Temperature
	}
	if opts.MaxTokens > 0 {
		options["num_predict"] = opts.MaxTokens
	}
	body := map[string]any{
		"model": model,
		"messages": []map[string]any{
			{"role": "user", "content": prompt},
		},
		"stream":  false,
		"think":   false, // local hot path skips reasoning by default
		"options": options,
	}
	if jsonMode {
		body["format"] = "json"
	}
	return body
}

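// Illustrative only: per the package comment, newer Ollama versions also
// accept a JSON Schema object in "format" instead of the string "json".
// For the {"ok": true} probe that would look roughly like:
//
//	body["format"] = map[string]any{
//		"type":       "object",
//		"properties": map[string]any{"ok": map[string]any{"type": "boolean"}},
//		"required":   []string{"ok"},
//	}
//
// format = "json" stays the default here because it works back to 0.4.
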
func (o *OllamaProvider) postChat(ctx context.Context, body map[string]any, opts CompleteOptions) (string, error) {
	bs, _ := json.Marshal(body)
	req, err := http.NewRequestWithContext(ctx, "POST", o.baseURL+"/api/chat", bytes.NewReader(bs))
	if err != nil {
		return "", err
	}
	req.Header.Set("Content-Type", "application/json")
	cli := o.httpClient
	if opts.TimeoutSeconds > 0 {
		cli = &http.Client{Timeout: time.Duration(opts.TimeoutSeconds) * time.Second}
	}
	resp, err := cli.Do(req)
	if err != nil {
		if errors.Is(ctx.Err(), context.DeadlineExceeded) {
			return "", fmt.Errorf("ollama timeout")
		}
		return "", fmt.Errorf("ollama request: %w", err)
	}
	defer resp.Body.Close()
	rb, _ := io.ReadAll(resp.Body)
	if resp.StatusCode/100 != 2 {
		return "", fmt.Errorf("ollama %d: %s", resp.StatusCode, abbrev(string(rb), 200))
	}
	var out struct {
		Message struct {
			Content string `json:"content"`
		} `json:"message"`
		Done       bool   `json:"done"`
		DoneReason string `json:"done_reason"`
	}
	if err := json.Unmarshal(rb, &out); err != nil {
		return "", fmt.Errorf("ollama decode: %w (body=%s)", err, abbrev(string(rb), 200))
	}
	return out.Message.Content, nil
}

// listTags hits /api/tags and returns the names of the locally available models.
func (o *OllamaProvider) listTags(ctx context.Context) ([]string, error) {
	cctx, cancel := context.WithTimeout(ctx, 5*time.Second)
	defer cancel()
	req, _ := http.NewRequestWithContext(cctx, "GET", o.baseURL+"/api/tags", nil)
	resp, err := o.httpClient.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode/100 != 2 {
		return nil, fmt.Errorf("status %d", resp.StatusCode)
	}
	rb, _ := io.ReadAll(resp.Body)
	var out struct {
		Models []struct {
			Name string `json:"name"`
		} `json:"models"`
	}
	if err := json.Unmarshal(rb, &out); err != nil {
		return nil, err
	}
	names := make([]string, 0, len(out.Models))
	for _, m := range out.Models {
		names = append(names, m.Name)
	}
	return names, nil
}

func abbrev(s string, n int) string {
	if len(s) <= n {
		return s
	}
	return s[:n] + "…"
}