3-lineage scrum (Opus 4.7 / Kimi K2.6 / Qwen3-coder) on today's wave landed 4 real findings (2 BLOCK + 2 WARN) and 2 INFO touch-ups. Verbatim verdicts + disposition table at: reports/scrum/_evidence/2026-04-30/

B-1 (BLOCK Opus + INFO Kimi convergent) — ResolveKey API: collapse from 3-arg (envVar, envFileName, envFilePath) to 2-arg (envVar, envFilePath); sketch below. Pre-fix, every chatd caller passed the env var name twice, so if an operator renamed *_key_env in lakehouse.toml while keeping the canonical KEY= line in the .env file, the fallback silently missed it.

B-2 (WARN Opus + WARN Kimi convergent) — handleProviders probe: drop the synthesize-then-Resolve probe and look providers up by name directly via Registry.Available(name); sketch below. The prior probe synthesized "<name>/probe" model strings and routed them through Resolve, which was fragile to any future routing rule (e.g. the cloud-suffix special case).

B-3 (BLOCK Opus single — verified by trace + end-to-end probe) — OllamaCloud.Chat StripPrefix used "cloud", but the registry routes "ollama_cloud/<m>", so upstream got the prefixed model name and 400'd; diff below. Smoke missed it because chatd_smoke runs without ollama_cloud registered. Now strips the right prefix, and the new TestOllamaCloud_StripsCorrectPrefix locks both the prefix and suffix cases. Verified live: ollama_cloud/deepseek-v3.2 round-trips cleanly through the real ollama.com endpoint.

B-4 (WARN Opus single) — Ollama finishReason: read the done_reason field instead of inferring from the done bool alone. Newer Ollama reports done=true with done_reason="length" on truncation; the prior code mapped that to "stop" and lost the truncation signal the playbook_lift judge needs to retry. The new TestFinishReasonFromOllama_PrefersDoneReason covers the fallback ladder.

INFOs:
- B-5: replace the hand-rolled insertion sort in Registry.Names with sort.Strings (Opus called the "avoid sort import" comment a false economy — correct).
- A-1: clarify the playbook_lift.sh comment around -judge "" arg passing (Opus noted the comment said "env priority" but didn't reflect that the empty arg also passes through the Go driver's resolution chain).

False positives dismissed (3, documented in disposition.md):
- Kimi: TestMaybeDowngrade_WithConfigList wrong assertion (the test IS correct per design — a model excluded from the weak list counts as strong, hence gets downgraded)
- Qwen: nil-deref claim (defensive code already handles nil)
- Opus: qwen3.5:latest doesn't exist on the Ollama hub (true for the public hub, but the local install has it)

just verify: PASS. chatd_smoke 6/6 PASS. New regression tests: 3 (B-2, B-3, B-4 each get a focused test).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
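B-1 sketch: the collapsed call shape. The two-arg signature is straight from the finding; the body (process env first, then the same variable's KEY= line in the file at envFilePath) is assumed semantics, not the verbatim chatd code (os/strings/fmt imports elided):

	func ResolveKey(envVar, envFilePath string) (string, error) {
		if v := os.Getenv(envVar); v != "" {
			return v, nil // process env wins
		}
		data, err := os.ReadFile(envFilePath)
		if err != nil {
			return "", err
		}
		// Fallback: look for envVar's KEY=... line in the .env file.
		for _, line := range strings.Split(string(data), "\n") {
			if v, ok := strings.CutPrefix(strings.TrimSpace(line), envVar+"="); ok {
				return v, nil
			}
		}
		return "", fmt.Errorf("%s: not set and not found in %s", envVar, envFilePath)
	}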
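B-2 sketch: before/after of the handleProviders probe. Method names are from the finding; the surrounding handler and return shapes are assumed:

	// Before: fabricate a model string and round-trip it through routing
	_, err := registry.Resolve(name + "/probe")
	ok := err == nil
	// After: ask the registry for the provider by name
	ok := registry.Available(name)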
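B-3 diff: the one-line fix in OllamaCloud.Chat (both prefix strings verbatim from the finding):

	// Before: prefix never matched, so upstream saw "ollama_cloud/<m>" and 400'd
	model := StripPrefix(req.Model, "cloud")
	// After: matches how the registry actually routes cloud models
	model := StripPrefix(req.Model, "ollama_cloud")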
package chat

import (
	"bytes"
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"net/http"
	"strings"
	"time"
)

// Ollama (local) provider — calls /api/chat on the local Ollama
// server. No auth needed; default URL http://localhost:11434.
//
// Bare model names route here by default (registry.defaultName=ollama),
// so "qwen3.5:latest" → ollama. Explicit "ollama/qwen3.5:latest" also
// works (prefix stripped).
type Ollama struct {
	baseURL    string
	httpClient *http.Client
}

// NewOllama returns a local Ollama provider. baseURL defaults to
// http://localhost:11434 when empty. timeout 0 → 180s.
func NewOllama(baseURL string, timeout time.Duration) *Ollama {
	if baseURL == "" {
		baseURL = "http://localhost:11434"
	}
	if timeout == 0 {
		timeout = 180 * time.Second
	}
	return &Ollama{
		baseURL:    strings.TrimRight(baseURL, "/"),
		httpClient: &http.Client{Timeout: timeout},
	}
}
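
// Typical construction; zero values select the defaults documented above:
//
//	_ = NewOllama("", 0) // → http://localhost:11434, 180s timeout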

func (o *Ollama) Name() string { return "ollama" }

// Available pings /api/tags. Cached negative result would be a
// premature optimization for G0 — Ollama is typically up. If down,
// next call gets ErrUpstream which is the right signal anyway.
func (o *Ollama) Available() bool {
	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
	defer cancel()
	req, _ := http.NewRequestWithContext(ctx, "GET", o.baseURL+"/api/tags", nil)
	resp, err := o.httpClient.Do(req)
	if err != nil {
		return false
	}
	defer resp.Body.Close()
	return resp.StatusCode/100 == 2
}

// Chat translates Request to Ollama's /api/chat shape and back.
// Strips the optional "ollama/" prefix from req.Model.
func (o *Ollama) Chat(ctx context.Context, req Request) (*Response, error) {
	model := StripPrefix(req.Model, "ollama")

	options := map[string]any{}
	// Pointer-valued temperature so "not set" (nil) doesn't overwrite
	// Ollama's default. Only forward when caller set it explicitly.
	if req.Temperature != nil {
		options["temperature"] = *req.Temperature
	}
	if req.MaxTokens > 0 {
		options["num_predict"] = req.MaxTokens
	}
	body := map[string]any{
		"model":    model,
		"messages": req.Messages,
		"stream":   false,
		// Local hot path: skip reasoning by default. qwen3 / qwen3.5 are
		// thinking-capable but the inner-loop use case wants direct
		// answers, not reasoning traces. Without this, low max_tokens
		// budgets get consumed by thinking before any content is
		// produced. Cloud tier (Ollama Cloud) inherits the same default
		// — see ollama_cloud.go.
		"think":   false,
		"options": options,
	}
	if req.Format == "json" {
		body["format"] = "json"
	}

	bs, _ := json.Marshal(body)
	httpReq, err := http.NewRequestWithContext(ctx, "POST", o.baseURL+"/api/chat", bytes.NewReader(bs))
	if err != nil {
		return nil, err
	}
	httpReq.Header.Set("Content-Type", "application/json")

	resp, err := o.httpClient.Do(httpReq)
	if err != nil {
		if errors.Is(ctx.Err(), context.DeadlineExceeded) {
			return nil, fmt.Errorf("%w: %s", ErrTimeout, "ollama")
		}
		return nil, fmt.Errorf("ollama: %w", err)
	}
	defer resp.Body.Close()

	rb, _ := io.ReadAll(resp.Body)
	if resp.StatusCode/100 != 2 {
		return nil, fmt.Errorf("%w: ollama %d: %s", ErrUpstream, resp.StatusCode, abbrev(string(rb), 200))
	}

	var ollamaResp struct {
		Model   string `json:"model"`
		Message struct {
			Content string `json:"content"`
		} `json:"message"`
		Done            bool   `json:"done"`
		DoneReason      string `json:"done_reason"` // "stop", "length", ...
		PromptEvalCount int    `json:"prompt_eval_count"`
		EvalCount       int    `json:"eval_count"`
	}
	if err := json.Unmarshal(rb, &ollamaResp); err != nil {
		return nil, fmt.Errorf("ollama decode: %w (body=%s)", err, abbrev(string(rb), 200))
	}

	return &Response{
		Model:        model,
		Content:      ollamaResp.Message.Content,
		InputTokens:  ollamaResp.PromptEvalCount,
		OutputTokens: ollamaResp.EvalCount,
		FinishReason: finishReasonFromOllama(ollamaResp.Done, ollamaResp.DoneReason),
	}, nil
}
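
// Example call shape (sketch only; the Message field layout shown here
// is assumed, not necessarily this package's exact type):
//
//	temp := 0.0
//	resp, err := o.Chat(ctx, Request{
//		Model:       "ollama/qwen3.5:latest", // "ollama/" prefix optional
//		Messages:    []Message{{Role: "user", Content: "ping"}},
//		Temperature: &temp, // nil would keep Ollama's default
//		MaxTokens:   64,
//	})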

// finishReasonFromOllama prefers Ollama's done_reason when present
// (newer Ollama 0.4+ exposes this on /api/chat). Falls back to the
// done bool for older versions. Phase 4 scrum fix B-4 (Opus WARN):
// previous logic mapped `done==true` → "stop" unconditionally, which
// hid truncations that Ollama reports as `done=true, done_reason="length"`.
// The playbook_lift judge needs this signal to detect when max_tokens
// budget ran out before the answer completed.
func finishReasonFromOllama(done bool, doneReason string) string {
	if doneReason != "" {
		return doneReason
	}
	if done {
		return "stop"
	}
	return "length"
}
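
// The resulting fallback ladder (B-4 fix):
//
//	done_reason != ""             → returned verbatim ("length" preserved)
//	done_reason == "", done=true  → "stop"   (older Ollama)
//	done_reason == "", done=false → "length" (generation cut off early)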

// abbrev shortens long error bodies for log/error messages without
// pulling fmt's truncation flags everywhere.
func abbrev(s string, n int) string {
	if len(s) <= n {
		return s
	}
	return s[:n] + "…"
}