package chat

import (
	"bytes"
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"net/http"
	"strings"
	"time"
	"unicode/utf8"
)

// Ollama (local) provider — calls /api/chat on the local Ollama
// server. No auth needed; default URL http://localhost:11434.
//
// Bare model names route here by default (registry.defaultName=ollama),
// so "qwen3.5:latest" → ollama. Explicit "ollama/qwen3.5:latest" also
// works (prefix stripped).
type Ollama struct {
	baseURL    string
	httpClient *http.Client
}

// NewOllama returns a local Ollama provider. baseURL defaults to
// http://localhost:11434 when empty. timeout 0 → 180s.
func NewOllama(baseURL string, timeout time.Duration) *Ollama {
	if baseURL == "" {
		baseURL = "http://localhost:11434"
	}
	if timeout == 0 {
		timeout = 180 * time.Second
	}
	return &Ollama{
		baseURL:    strings.TrimRight(baseURL, "/"),
		httpClient: &http.Client{Timeout: timeout},
	}
}

// Name returns the provider's registry key.
func (o *Ollama) Name() string { return "ollama" }

// Available pings /api/tags. Cached negative result would be a
// premature optimization for G0 — Ollama is typically up. If down,
// next call gets ErrUpstream which is the right signal anyway.
func (o *Ollama) Available() bool {
	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
	defer cancel()
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, o.baseURL+"/api/tags", nil)
	if err != nil {
		// Malformed baseURL — treat as unavailable rather than panic later.
		return false
	}
	resp, err := o.httpClient.Do(req)
	if err != nil {
		return false
	}
	defer resp.Body.Close()
	return resp.StatusCode/100 == 2
}

// Chat translates Request to Ollama's /api/chat shape and back.
// Strips the optional "ollama/" prefix from req.Model.
//
// Errors: ErrTimeout when ctx's deadline expired during the HTTP call,
// ErrUpstream on a non-2xx status, plain wrapped errors otherwise.
func (o *Ollama) Chat(ctx context.Context, req Request) (*Response, error) {
	model := StripPrefix(req.Model, "ollama")

	options := map[string]any{}
	// Pointer-valued temperature so "not set" (nil) doesn't overwrite
	// Ollama's default. Only forward when caller set it explicitly.
	if req.Temperature != nil {
		options["temperature"] = *req.Temperature
	}
	if req.MaxTokens > 0 {
		options["num_predict"] = req.MaxTokens
	}

	body := map[string]any{
		"model":    model,
		"messages": req.Messages,
		"stream":   false,
		// Local hot path: skip reasoning by default. qwen3 / qwen3.5 are
		// thinking-capable but the inner-loop use case wants direct
		// answers, not reasoning traces. Without this, low max_tokens
		// budgets get consumed by thinking before any content is
		// produced. Cloud tier (Ollama Cloud) inherits the same default
		// — see ollama_cloud.go.
		"think":   false,
		"options": options,
	}
	if req.Format == "json" {
		body["format"] = "json"
	}

	bs, err := json.Marshal(body)
	if err != nil {
		// req.Messages is caller-supplied; don't assume it always marshals.
		return nil, fmt.Errorf("ollama encode request: %w", err)
	}

	httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, o.baseURL+"/api/chat", bytes.NewReader(bs))
	if err != nil {
		return nil, err
	}
	httpReq.Header.Set("Content-Type", "application/json")

	resp, err := o.httpClient.Do(httpReq)
	if err != nil {
		if errors.Is(ctx.Err(), context.DeadlineExceeded) {
			return nil, fmt.Errorf("%w: %s", ErrTimeout, "ollama")
		}
		return nil, fmt.Errorf("ollama: %w", err)
	}
	defer resp.Body.Close()

	rb, err := io.ReadAll(resp.Body)
	if err != nil {
		// A truncated body would otherwise surface as a confusing decode
		// error (or an empty upstream-error message) below.
		return nil, fmt.Errorf("ollama read response: %w", err)
	}
	if resp.StatusCode/100 != 2 {
		return nil, fmt.Errorf("%w: ollama %d: %s", ErrUpstream, resp.StatusCode, abbrev(string(rb), 200))
	}

	var ollamaResp struct {
		Model   string `json:"model"`
		Message struct {
			Content string `json:"content"`
		} `json:"message"`
		Done            bool   `json:"done"`
		DoneReason      string `json:"done_reason"` // "stop", "length", ...
		PromptEvalCount int    `json:"prompt_eval_count"`
		EvalCount       int    `json:"eval_count"`
	}
	if err := json.Unmarshal(rb, &ollamaResp); err != nil {
		return nil, fmt.Errorf("ollama decode: %w (body=%s)", err, abbrev(string(rb), 200))
	}

	return &Response{
		Model:        model,
		Content:      ollamaResp.Message.Content,
		InputTokens:  ollamaResp.PromptEvalCount,
		OutputTokens: ollamaResp.EvalCount,
		FinishReason: finishReasonFromOllama(ollamaResp.Done, ollamaResp.DoneReason),
	}, nil
}

// finishReasonFromOllama prefers Ollama's done_reason when present
// (newer Ollama 0.4+ exposes this on /api/chat). Falls back to the
// done bool for older versions. Phase 4 scrum fix B-4 (Opus WARN):
// previous logic mapped `done==true` → "stop" unconditionally, which
// hid truncations that Ollama reports as `done=true, done_reason="length"`.
// The playbook_lift judge needs this signal to detect when max_tokens
// budget ran out before the answer completed.
func finishReasonFromOllama(done bool, doneReason string) string {
	if doneReason != "" {
		return doneReason
	}
	if done {
		return "stop"
	}
	return "length"
}

// abbrev shortens long error bodies for log/error messages without
// pulling fmt's truncation flags everywhere. Truncation backs up to a
// UTF-8 rune boundary so the result is always valid UTF-8 (error
// bodies may contain multi-byte characters).
func abbrev(s string, n int) string {
	if len(s) <= n {
		return s
	}
	// Never cut a multi-byte rune in half: step back to a rune start.
	for n > 0 && !utf8.RuneStart(s[n]) {
		n--
	}
	return s[:n] + "…"
}