Cross-lineage scrum (Opus 4.7 / Kimi K2.6 / Qwen3-coder via chatd's
/v1/chat) on the harness's first 4 commits surfaced 5 real bugs;
this commit lands fixes for the 4 inside the LLM/validator stack.
B5 (scanner skip-list semantics) ships separately as it changes scan
behavior on every target repo.
B1 (Kimi BLOCK + Opus WARN convergent) — internal/validators:
evidencePresent had two flaws: (1) cursor advanced on match in the
trim-line fallback, breaking same-line repeated matches AND skipping
not-yet-considered lines so out-of-order evidence spuriously failed;
(2) strings.Contains on a single `}` trim-matched any closing brace
in the file, defeating the "evidence quotes real text" contract.
Fix: trivial-evidence guard FIRST (reject anything <4 non-whitespace
chars) + per-line search no longer advances a cursor. New regression
test TestEvidencePresent_RejectsTrivialMatches covers `}`, `{`, `)`,
empty, and out-of-order multi-line evidence (which now passes —
order isn't part of the contract).
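Rough shape of the fixed check (sketch only; the real evidencePresent
in internal/validators differs in signature and keeps its separate
exact-match/trim-fallback paths, collapsed here into one Contains pass;
assumes strings + unicode imports):

    // Illustrative sketch, not the shipped code.
    func evidencePresent(evidence, fileLines []string) bool {
        for _, ev := range evidence {
            want := strings.TrimSpace(ev)
            nonWS := 0
            for _, r := range want {
                if !unicode.IsSpace(r) {
                    nonWS++
                }
            }
            // Guard runs FIRST: a bare `}`, `{`, or `)` never reaches the
            // Contains check, so it can't trim-match arbitrary lines.
            if nonWS < 4 {
                return false
            }
            // Fresh scan per evidence item: no shared cursor, so order and
            // same-line repeated matches no longer matter.
            found := false
            for _, line := range fileLines {
                if strings.Contains(strings.TrimSpace(line), want) {
                    found = true
                    break
                }
            }
            if !found {
                return false
            }
        }
        return true
    }
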
B2 (Kimi WARN + Opus WARN convergent) — internal/pipeline:
WriteJSON error for rejected-findings.json was swallowed with
`if err == nil`, so a write failure left the validation phase
reporting status="ok" while the audit trail vanished. Mirror the
validated-findings branch: surface the error in
validatePhase.Errors + bump status to degraded + ExitCode=66.
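Shape of the fix (sketch; WriteJSON and validatePhase.Errors come from
the pipeline, the other names are stand-ins, not the real fields):

    // Illustrative sketch, not the shipped code.
    if err := WriteJSON(rejectedPath, rejectedFindings); err != nil {
        validatePhase.Errors = append(validatePhase.Errors,
            "write rejected-findings.json: "+err.Error())
        validatePhase.Status = "degraded" // pre-fix this silently stayed "ok"
        report.ExitCode = 66
    }
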
B3 (Kimi BLOCK + Opus BLOCK convergent) — internal/llm/ollama.go:
HealthCheck.basic_prompt_ok was set to true on ANY non-empty
response, so a model emitting `<think>...` traces or apologies
passed silently. Now requires the response to contain "OK"
(case-insensitive, substring). Substring rather than equality lets minor
whitespace/punctuation variations through (some models add a
trailing period). Errors now record what the model actually said
when it fails the check.
B4 (Opus BLOCK only — same class as today's chatd Anthropic-temp
fix) — internal/llm/ollama.go: chatBody had `if opts.Temperature != 0`
which silently dropped Temperature=0 from the request, so HealthCheck
+ Reviewer (both pass Temperature=0 expecting determinism) actually
ran at Ollama's ~0.8 default. Always forward Temperature now. The
two callers always set explicit values, so "0 means 0" is correct;
if a future caller wants Ollama's default they'll switch
CompleteOptions.Temperature to *float64 like chatd did this morning.
Verified end-to-end: insecure-repo + --enable-llm still produces 25
confirmed findings (16 static + 9 LLM), 0 rejected. Validator unit
tests: 11 pass (added TestEvidencePresent_RejectsTrivialMatches).
Same-day-as-shipping scrum, same-day-as-shipping fixes. The
convergent-≥2 gate caught 3 of these; the 4th was Opus-only but
verified by reading the code (same idiom as today's chatd bug).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

internal/llm/ollama.go (251 lines, 8.6 KiB, Go)

// Ollama provider — local-first per PROMPT.md.
//
// HealthCheck: probes /api/tags (server up + model list) + a 1-token
// completion + a strict-JSON probe. Used by `model doctor`.
//
// Complete + CompleteJSON: POST /api/chat with stream=false. JSON
// mode uses Ollama's native `format: "json"` — newer Ollama versions
// also accept a JSON Schema there but format=json is the lowest-
// common-denominator that works back to 0.4.
//
// `think: false` is set for ALL completions per the Lakehouse-Go
// 2026-04-30 finding: qwen3.5:latest and qwen3:latest are reasoning-
// capable but the inner-loop hot path wants direct answers, not
// `<think>` traces consuming the token budget. Callers that NEED
// reasoning override via opts (Phase F+, not yet wired).
package llm

import (
	"bytes"
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"net/http"
	"strings"
	"time"
)

// OllamaProvider is the concrete impl. Stateless; safe for concurrent
// use (the http.Client handles connection pooling).
type OllamaProvider struct {
	baseURL    string
	httpClient *http.Client
}

// NewOllama returns a provider pointed at baseURL. Empty baseURL
// defaults to http://localhost:11434. timeout 0 → 120s (matches
// model-profile default).
func NewOllama(baseURL string, timeout time.Duration) *OllamaProvider {
	if baseURL == "" {
		baseURL = "http://localhost:11434"
	}
	if timeout == 0 {
		timeout = 120 * time.Second
	}
	return &OllamaProvider{
		baseURL:    strings.TrimRight(baseURL, "/"),
		httpClient: &http.Client{Timeout: timeout},
	}
}

func (o *OllamaProvider) Name() string { return "ollama" }

// HealthCheck runs the 5 probes documented in REPORT_SCHEMA.md
// model-doctor.json shape:
// - server_available: GET /api/tags returns 2xx
// - primary_model_available: name appears in tag list
// - fallback_model_available: name appears in tag list
// - basic_prompt_ok: a 5-token "reply OK" round-trips
// - json_mode_ok: a JSON probe parses cleanly
//
// Errors surface in HealthStatus.Errors as human-readable strings
// (no stack trace shape — operators run this from a shell).
func (o *OllamaProvider) HealthCheck(ctx context.Context, primary, fallback string) HealthStatus {
	st := HealthStatus{Errors: []string{}}

	// 1. Server availability + model list
	tags, err := o.listTags(ctx)
	if err != nil {
		st.Errors = append(st.Errors, "list models: "+err.Error())
		return st
	}
	st.ServerAvailable = true

	loaded := map[string]bool{}
	for _, t := range tags {
		loaded[t] = true
	}
	st.PrimaryModelAvailable = primary != "" && loaded[primary]
	st.FallbackModelAvailable = fallback != "" && loaded[fallback]

	// Pick the model we'll use for the live probes — primary if
	// loaded, else fallback, else the first model Ollama has.
	probeModel := ""
	switch {
	case st.PrimaryModelAvailable:
		probeModel = primary
	case st.FallbackModelAvailable:
		probeModel = fallback
	case len(tags) > 0:
		probeModel = tags[0]
		st.Errors = append(st.Errors,
			fmt.Sprintf("neither primary %q nor fallback %q loaded; using %q for liveness probe", primary, fallback, probeModel))
	default:
		st.Errors = append(st.Errors, "no models loaded; can't run liveness probe")
		return st
	}

	// 2. Basic completion. Scrum fix B3 (Kimi BLOCK + Opus BLOCK,
	// 2026-04-30): checks that the response actually contains "OK"
	// (case-insensitive, substring) — pre-fix accepted any non-empty
	// string, so a thinking-model's `<think>...` trace or an apology
	// passed silently. Substring rather than equality because some
	// models surround the answer with whitespace or a trailing period.
	if got, err := o.Complete(ctx, probeModel, "Reply with the single word: OK", CompleteOptions{Temperature: 0, MaxTokens: 8, TimeoutSeconds: 30}); err != nil {
		st.Errors = append(st.Errors, "basic prompt: "+err.Error())
	} else if upper := strings.ToUpper(strings.TrimSpace(got)); upper == "OK" || strings.Contains(upper, "OK") {
		st.BasicPromptOK = true
	} else {
		st.Errors = append(st.Errors, fmt.Sprintf("basic prompt: expected 'OK', got %q", abbrev(got, 80)))
	}

	// 3. JSON-mode completion
	jsonGot, err := o.CompleteJSON(ctx, probeModel, `Output exactly this JSON and nothing else: {"ok": true}`, CompleteOptions{Temperature: 0, MaxTokens: 32, TimeoutSeconds: 30})
	if err != nil {
		st.Errors = append(st.Errors, "json mode: "+err.Error())
	} else {
		var probe struct{ Ok bool }
		if json.Unmarshal([]byte(jsonGot), &probe) == nil {
			st.JSONModeOK = true
		} else {
			st.Errors = append(st.Errors, "json mode: parse failed; raw="+abbrev(jsonGot, 200))
		}
	}

	return st
}

// Complete posts to /api/chat with stream=false. Returns just the
// assistant content; token counts not surfaced (callers that need
// them go via the chat-shape API directly, which we'll expose later).
func (o *OllamaProvider) Complete(ctx context.Context, model, prompt string, opts CompleteOptions) (string, error) {
	body := o.chatBody(model, prompt, opts, false)
	return o.postChat(ctx, body, opts)
}

// CompleteJSON requests Ollama's native JSON-mode constrained output.
// The `format: "json"` field forces grammar-constrained generation —
// the model can only emit valid JSON. Some models still emit garbage
// in the content field (e.g. preamble text); validation is the
// caller's job (PROMPT.md "AI may suggest. Code validates.").
func (o *OllamaProvider) CompleteJSON(ctx context.Context, model, prompt string, opts CompleteOptions) (string, error) {
	body := o.chatBody(model, prompt, opts, true)
	return o.postChat(ctx, body, opts)
}

func (o *OllamaProvider) chatBody(model, prompt string, opts CompleteOptions, jsonMode bool) map[string]any {
	options := map[string]any{}
	// Scrum fix B4 (Opus BLOCK, 2026-04-30): always forward the
	// caller-supplied Temperature, including 0. Pre-fix `if != 0`
	// silently dropped the field for callers wanting deterministic
	// generation, so Ollama's ~0.8 default applied to the JSON
	// probe + every reviewer call. CompleteOptions.Temperature is
	// still float64 (not *float64) — the harness's two callers
	// (HealthCheck, Reviewer) always set it explicitly, so "0
	// means 0" is the right semantic. If a future caller wants
	// "use Ollama default", they can set MaxTokens=0 + delete the
	// option (or we'll switch to *float64 like chatd did).
	options["temperature"] = opts.Temperature
	if opts.MaxTokens > 0 {
		options["num_predict"] = opts.MaxTokens
	}
	body := map[string]any{
		"model": model,
		"messages": []map[string]any{
			{"role": "user", "content": prompt},
		},
		"stream":  false,
		"think":   false, // local hot path skips reasoning by default
		"options": options,
	}
	if jsonMode {
		body["format"] = "json"
	}
	return body
}

func (o *OllamaProvider) postChat(ctx context.Context, body map[string]any, opts CompleteOptions) (string, error) {
	bs, _ := json.Marshal(body)
	req, err := http.NewRequestWithContext(ctx, "POST", o.baseURL+"/api/chat", bytes.NewReader(bs))
	if err != nil {
		return "", err
	}
	req.Header.Set("Content-Type", "application/json")

	cli := o.httpClient
	if opts.TimeoutSeconds > 0 {
		cli = &http.Client{Timeout: time.Duration(opts.TimeoutSeconds) * time.Second}
	}
	resp, err := cli.Do(req)
	if err != nil {
		if errors.Is(ctx.Err(), context.DeadlineExceeded) {
			return "", fmt.Errorf("ollama timeout")
		}
		return "", fmt.Errorf("ollama request: %w", err)
	}
	defer resp.Body.Close()
	rb, _ := io.ReadAll(resp.Body)
	if resp.StatusCode/100 != 2 {
		return "", fmt.Errorf("ollama %d: %s", resp.StatusCode, abbrev(string(rb), 200))
	}
	var out struct {
		Message struct {
			Content string `json:"content"`
		} `json:"message"`
		Done       bool   `json:"done"`
		DoneReason string `json:"done_reason"`
	}
	if err := json.Unmarshal(rb, &out); err != nil {
		return "", fmt.Errorf("ollama decode: %w (body=%s)", err, abbrev(string(rb), 200))
	}
	return out.Message.Content, nil
}

// listTags hits /api/tags and returns the loaded-model name list.
func (o *OllamaProvider) listTags(ctx context.Context) ([]string, error) {
	cctx, cancel := context.WithTimeout(ctx, 5*time.Second)
	defer cancel()
	req, _ := http.NewRequestWithContext(cctx, "GET", o.baseURL+"/api/tags", nil)
	resp, err := o.httpClient.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode/100 != 2 {
		return nil, fmt.Errorf("status %d", resp.StatusCode)
	}
	rb, _ := io.ReadAll(resp.Body)
	var out struct {
		Models []struct {
			Name string `json:"name"`
		} `json:"models"`
	}
	if err := json.Unmarshal(rb, &out); err != nil {
		return nil, err
	}
	names := make([]string, 0, len(out.Models))
	for _, m := range out.Models {
		names = append(names, m.Name)
	}
	return names, nil
}

func abbrev(s string, n int) string {
	if len(s) <= n {
		return s
	}
	return s[:n] + "…"
}