Cross-lineage scrum (Opus 4.7 / Kimi K2.6 / Qwen3-coder via chatd's
/v1/chat) on the harness's first 4 commits surfaced 5 real bugs;
this commit lands fixes for the 4 inside the LLM/validator stack.
B5 (scanner skip-list semantics) ships separately as it changes scan
behavior on every target repo.
B1 (Kimi BLOCK + Opus WARN convergent) — internal/validators:
evidencePresent had two flaws: (1) cursor advanced on match in the
trim-line fallback, breaking same-line repeated matches AND skipping
not-yet-considered lines so out-of-order evidence spuriously failed;
(2) strings.Contains on a single `}` trim-matched any closing brace
in the file, defeating the "evidence quotes real text" contract.
Fix: trivial-evidence guard FIRST (reject anything <4 non-whitespace
chars) + per-line search no longer advances a cursor. New regression
test TestEvidencePresent_RejectsTrivialMatches covers `}`, `{`, `)`,
empty, and out-of-order multi-line evidence (which now passes —
order isn't part of the contract).
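Rough shape of the fixed check (sketch only; the real evidencePresent
in internal/validators differs in signature and keeps its separate
exact-match/trim-fallback paths, collapsed here into one Contains pass;
assumes strings + unicode imports):

    // Illustrative sketch, not the shipped code.
    func evidencePresent(evidence, fileLines []string) bool {
        for _, ev := range evidence {
            want := strings.TrimSpace(ev)
            nonWS := 0
            for _, r := range want {
                if !unicode.IsSpace(r) {
                    nonWS++
                }
            }
            // Guard runs FIRST: a bare `}`, `{`, or `)` never reaches the
            // Contains check, so it can't trim-match arbitrary lines.
            if nonWS < 4 {
                return false
            }
            // Fresh scan per evidence item: no shared cursor, so order and
            // same-line repeated matches no longer matter.
            found := false
            for _, line := range fileLines {
                if strings.Contains(strings.TrimSpace(line), want) {
                    found = true
                    break
                }
            }
            if !found {
                return false
            }
        }
        return true
    }
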
B2 (Kimi WARN + Opus WARN convergent) — internal/pipeline:
WriteJSON error for rejected-findings.json was swallowed with
`if err == nil`, so a write failure left the validation phase
reporting status="ok" while the audit trail vanished. Mirror the
validated-findings branch: surface the error in
validatePhase.Errors + bump status to degraded + ExitCode=66.
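Shape of the fix (sketch; WriteJSON and validatePhase.Errors come from
the pipeline, the other names are stand-ins, not the real fields):

    // Illustrative sketch, not the shipped code.
    if err := WriteJSON(rejectedPath, rejectedFindings); err != nil {
        validatePhase.Errors = append(validatePhase.Errors,
            "write rejected-findings.json: "+err.Error())
        validatePhase.Status = "degraded" // pre-fix this silently stayed "ok"
        report.ExitCode = 66
    }
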
B3 (Kimi BLOCK + Opus BLOCK convergent) — internal/llm/ollama.go:
HealthCheck.basic_prompt_ok was set to true on ANY non-empty
response, so a model emitting `<think>...` traces or apologies
passed silently. Now requires the response to contain "OK"
(case-insensitive, substring). Substring rather than equality lets minor
whitespace/punctuation variations through (some models add a
trailing period). Errors now record what the model actually said
when it fails the check.
B4 (Opus BLOCK only — same class as today's chatd Anthropic-temp
fix) — internal/llm/ollama.go: chatBody had `if opts.Temperature != 0`
which silently dropped Temperature=0 from the request, so HealthCheck
+ Reviewer (both pass Temperature=0 expecting determinism) actually
ran at Ollama's ~0.8 default. Always forward Temperature now. The
two callers always set explicit values, so "0 means 0" is correct;
if a future caller wants Ollama's default they'll switch
CompleteOptions.Temperature to *float64 like chatd did this morning.
Verified end-to-end: insecure-repo + --enable-llm still produces 25
confirmed findings (16 static + 9 LLM), 0 rejected. Validator unit
tests: 11 pass (added TestEvidencePresent_RejectsTrivialMatches).
Same-day-as-shipping scrum, same-day-as-shipping fixes. The
convergent-≥2 gate caught 3 of these; the 4th was Opus-only but
verified by reading the code (same idiom as today's chatd bug).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

internal/llm/ollama.go (251 lines, 8.6 KiB, Go)

// Ollama provider — local-first per PROMPT.md.
//
// HealthCheck: probes /api/tags (server up + model list) + a 1-token
// completion + a strict-JSON probe. Used by `model doctor`.
//
// Complete + CompleteJSON: POST /api/chat with stream=false. JSON
// mode uses Ollama's native `format: "json"` — newer Ollama versions
// also accept a JSON Schema there but format=json is the lowest-
// common-denominator that works back to 0.4.
//
// `think: false` is set for ALL completions per the Lakehouse-Go
// 2026-04-30 finding: qwen3.5:latest and qwen3:latest are reasoning-
// capable but the inner-loop hot path wants direct answers, not
// `<think>` traces consuming the token budget. Callers that NEED
// reasoning override via opts (Phase F+, not yet wired).
package llm

import (
	"bytes"
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"net/http"
	"strings"
	"time"
)

// OllamaProvider is the concrete impl. Stateless; safe for concurrent
// use (the http.Client handles connection pooling).
type OllamaProvider struct {
	baseURL    string
	httpClient *http.Client
}

// NewOllama returns a provider pointed at baseURL. Empty baseURL
// defaults to http://localhost:11434. timeout 0 → 120s (matches
// model-profile default).
func NewOllama(baseURL string, timeout time.Duration) *OllamaProvider {
	if baseURL == "" {
		baseURL = "http://localhost:11434"
	}
	if timeout == 0 {
		timeout = 120 * time.Second
	}
	return &OllamaProvider{
		baseURL:    strings.TrimRight(baseURL, "/"),
		httpClient: &http.Client{Timeout: timeout},
	}
}

func (o *OllamaProvider) Name() string { return "ollama" }

// HealthCheck runs the 5 probes documented in REPORT_SCHEMA.md
// model-doctor.json shape:
// - server_available: GET /api/tags returns 2xx
// - primary_model_available: name appears in tag list
// - fallback_model_available: name appears in tag list
// - basic_prompt_ok: a 5-token "reply OK" round-trips
// - json_mode_ok: a JSON probe parses cleanly
//
// Errors surface in HealthStatus.Errors as human-readable strings
// (no stack trace shape — operators run this from a shell).
func (o *OllamaProvider) HealthCheck(ctx context.Context, primary, fallback string) HealthStatus {
	st := HealthStatus{Errors: []string{}}

	// 1. Server availability + model list
	tags, err := o.listTags(ctx)
	if err != nil {
		st.Errors = append(st.Errors, "list models: "+err.Error())
		return st
	}
	st.ServerAvailable = true

	loaded := map[string]bool{}
	for _, t := range tags {
		loaded[t] = true
	}
	st.PrimaryModelAvailable = primary != "" && loaded[primary]
	st.FallbackModelAvailable = fallback != "" && loaded[fallback]

	// Pick the model we'll use for the live probes — primary if
	// loaded, else fallback, else the first model Ollama has.
	probeModel := ""
	switch {
	case st.PrimaryModelAvailable:
		probeModel = primary
	case st.FallbackModelAvailable:
		probeModel = fallback
	case len(tags) > 0:
		probeModel = tags[0]
		st.Errors = append(st.Errors,
			fmt.Sprintf("neither primary %q nor fallback %q loaded; using %q for liveness probe", primary, fallback, probeModel))
	default:
		st.Errors = append(st.Errors, "no models loaded; can't run liveness probe")
		return st
	}

	// 2. Basic completion. Scrum fix B3 (Kimi BLOCK + Opus BLOCK,
	// 2026-04-30): checks that the response actually contains "OK"
	// (case-insensitive, substring) — pre-fix accepted any non-empty
	// string, so a thinking-model's `<think>...` trace or an apology
	// passed silently. Substring rather than equality because some
	// models surround the answer with whitespace or a trailing period.
	if got, err := o.Complete(ctx, probeModel, "Reply with the single word: OK", CompleteOptions{Temperature: 0, MaxTokens: 8, TimeoutSeconds: 30}); err != nil {
		st.Errors = append(st.Errors, "basic prompt: "+err.Error())
	} else if upper := strings.ToUpper(strings.TrimSpace(got)); upper == "OK" || strings.Contains(upper, "OK") {
		st.BasicPromptOK = true
	} else {
		st.Errors = append(st.Errors, fmt.Sprintf("basic prompt: expected 'OK', got %q", abbrev(got, 80)))
	}

	// 3. JSON-mode completion
	jsonGot, err := o.CompleteJSON(ctx, probeModel, `Output exactly this JSON and nothing else: {"ok": true}`, CompleteOptions{Temperature: 0, MaxTokens: 32, TimeoutSeconds: 30})
	if err != nil {
		st.Errors = append(st.Errors, "json mode: "+err.Error())
	} else {
		var probe struct{ Ok bool }
		if json.Unmarshal([]byte(jsonGot), &probe) == nil {
			st.JSONModeOK = true
		} else {
			st.Errors = append(st.Errors, "json mode: parse failed; raw="+abbrev(jsonGot, 200))
		}
	}

	return st
}

// Complete posts to /api/chat with stream=false. Returns just the
// assistant content; token counts not surfaced (callers that need
// them go via the chat-shape API directly, which we'll expose later).
func (o *OllamaProvider) Complete(ctx context.Context, model, prompt string, opts CompleteOptions) (string, error) {
	body := o.chatBody(model, prompt, opts, false)
	return o.postChat(ctx, body, opts)
}

// CompleteJSON requests Ollama's native JSON-mode constrained output.
// The `format: "json"` field forces grammar-constrained generation —
// the model can only emit valid JSON. Some models still emit garbage
// in the content field (e.g. preamble text); validation is the
// caller's job (PROMPT.md "AI may suggest. Code validates.").
func (o *OllamaProvider) CompleteJSON(ctx context.Context, model, prompt string, opts CompleteOptions) (string, error) {
	body := o.chatBody(model, prompt, opts, true)
	return o.postChat(ctx, body, opts)
}

func (o *OllamaProvider) chatBody(model, prompt string, opts CompleteOptions, jsonMode bool) map[string]any {
	options := map[string]any{}
	// Scrum fix B4 (Opus BLOCK, 2026-04-30): always forward the
	// caller-supplied Temperature, including 0. Pre-fix `if != 0`
	// silently dropped the field for callers wanting deterministic
	// generation, so Ollama's ~0.8 default applied to the JSON
	// probe + every reviewer call. CompleteOptions.Temperature is
	// still float64 (not *float64) — the harness's two callers
	// (HealthCheck, Reviewer) always set it explicitly, so "0
	// means 0" is the right semantic. If a future caller wants
	// "use Ollama default", they can set MaxTokens=0 + delete the
	// option (or we'll switch to *float64 like chatd did).
	options["temperature"] = opts.Temperature
	if opts.MaxTokens > 0 {
		options["num_predict"] = opts.MaxTokens
	}
	body := map[string]any{
		"model": model,
		"messages": []map[string]any{
			{"role": "user", "content": prompt},
		},
		"stream":  false,
		"think":   false, // local hot path skips reasoning by default
		"options": options,
	}
	if jsonMode {
		body["format"] = "json"
	}
	return body
}

func (o *OllamaProvider) postChat(ctx context.Context, body map[string]any, opts CompleteOptions) (string, error) {
	bs, _ := json.Marshal(body)
	req, err := http.NewRequestWithContext(ctx, "POST", o.baseURL+"/api/chat", bytes.NewReader(bs))
	if err != nil {
		return "", err
	}
	req.Header.Set("Content-Type", "application/json")

	cli := o.httpClient
	if opts.TimeoutSeconds > 0 {
		cli = &http.Client{Timeout: time.Duration(opts.TimeoutSeconds) * time.Second}
	}
	resp, err := cli.Do(req)
	if err != nil {
		if errors.Is(ctx.Err(), context.DeadlineExceeded) {
			return "", fmt.Errorf("ollama timeout")
		}
		return "", fmt.Errorf("ollama request: %w", err)
	}
	defer resp.Body.Close()
	rb, _ := io.ReadAll(resp.Body)
	if resp.StatusCode/100 != 2 {
		return "", fmt.Errorf("ollama %d: %s", resp.StatusCode, abbrev(string(rb), 200))
	}
	var out struct {
		Message struct {
			Content string `json:"content"`
		} `json:"message"`
		Done       bool   `json:"done"`
		DoneReason string `json:"done_reason"`
	}
	if err := json.Unmarshal(rb, &out); err != nil {
		return "", fmt.Errorf("ollama decode: %w (body=%s)", err, abbrev(string(rb), 200))
	}
	return out.Message.Content, nil
}

// listTags hits /api/tags and returns the loaded-model name list.
func (o *OllamaProvider) listTags(ctx context.Context) ([]string, error) {
	cctx, cancel := context.WithTimeout(ctx, 5*time.Second)
	defer cancel()
	req, _ := http.NewRequestWithContext(cctx, "GET", o.baseURL+"/api/tags", nil)
	resp, err := o.httpClient.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode/100 != 2 {
		return nil, fmt.Errorf("status %d", resp.StatusCode)
	}
	rb, _ := io.ReadAll(resp.Body)
	var out struct {
		Models []struct {
			Name string `json:"name"`
		} `json:"models"`
	}
	if err := json.Unmarshal(rb, &out); err != nil {
		return nil, err
	}
	names := make([]string, 0, len(out.Models))
	for _, m := range out.Models {
		names = append(names, m.Name)
	}
	return names, nil
}

func abbrev(s string, n int) string {
	if len(s) <= n {
		return s
	}
	return s[:n] + "…"
}