3-lineage scrum (Opus 4.7 / Kimi K2.6 / Qwen3-coder) on today's wave landed 4 real findings (2 BLOCK + 2 WARN) and 2 INFO touch-ups. Verbatim verdicts + disposition table at: reports/scrum/_evidence/2026-04-30/ B-1 (BLOCK Opus + INFO Kimi convergent) — ResolveKey API: collapse from 3-arg (envVar, envFileName, envFilePath) to 2-arg (envVar, envFilePath). Pre-fix every chatd caller passed the env var name twice; if operator renamed *_key_env in lakehouse.toml while keeping the canonical KEY= line in the .env file, fallback silently missed. B-2 (WARN Opus + WARN Kimi convergent) — handleProviders probe: drop the synthesize-then-Resolve probe; look up by name directly via Registry.Available(name). Prior probe synthesized "<name>/probe" model strings and routed through Resolve, fragile to any future routing rule (e.g. cloud-suffix special case). B-3 (BLOCK Opus single — verified by trace + end-to-end probe) — OllamaCloud.Chat StripPrefix used "cloud" but registry routes "ollama_cloud/<m>". Result: upstream got the prefixed model name and 400'd. Smoke missed it because chatd_smoke runs without ollama_cloud registered. Now strips the right prefix; new TestOllamaCloud_StripsCorrectPrefix locks both prefix + suffix cases. Verified live: ollama_cloud/deepseek-v3.2 round-trips cleanly through the real ollama.com endpoint. B-4 (WARN Opus single) — Ollama finishReason: read done_reason field instead of inferring from done bool alone. Newer Ollama reports done=true with done_reason="length" on truncation; the prior code mapped that to "stop" and lost the truncation signal the playbook_lift judge needs to retry. New TestFinishReasonFromOllama_PrefersDoneReason covers the fallback ladder. INFOs: - B-5: replace hand-rolled insertion sort in Registry.Names with sort.Strings (Opus called the "avoid sort import" comment a false economy — correct). 
- A-1: clarify the playbook_lift.sh comment around -judge "" arg passing (Opus noted the comment said "env priority" but didn't reflect that the empty arg also passes through the Go driver's resolution chain). False positives dismissed (3, documented in disposition.md): - Kimi: TestMaybeDowngrade_WithConfigList wrong assertion (test IS correct per design — model excluded from weak list = strong = downgrade) - Qwen: nil-deref claim (defensive code already handles nil) - Opus: qwen3.5:latest doesn't exist on Ollama hub (true on the public hub but local install has it) just verify: PASS. chatd_smoke 6/6 PASS. New regression tests: 3 (B-2, B-3, B-4 each get a focused test). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
165 lines
4.9 KiB
Go
165 lines
4.9 KiB
Go
package chat
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"encoding/json"
|
|
"errors"
|
|
"fmt"
|
|
"io"
|
|
"net/http"
|
|
"strings"
|
|
"time"
|
|
)
|
|
|
|
// OllamaCloud — Ollama Cloud (Pro plan as of 2026-04-28). Bearer
// auth via OLLAMA_CLOUD_KEY. Wire format mirrors the Rust adapter
// (crates/gateway/src/v1/ollama_cloud.rs) — system/prompt split
// against /api/generate with cloud-friendly num_predict floor.
//
// Model routing: "ollama_cloud/<model>" (prefix) and "<model>:cloud"
// (suffix) both route here. Registry.Resolve handles the suffix case;
// the prefix case strips "ollama_cloud/" before the upstream call so
// ollama.com sees the bare model name. Suffix-form models pass through
// StripPrefix unchanged (no leading prefix to strip — exactly what the
// upstream wants since "kimi-k2.6:cloud" is the canonical name on
// ollama.com).
type OllamaCloud struct {
	// apiKey is the bearer token sent in the Authorization header;
	// an empty key marks the provider unavailable (see Available).
	apiKey string
	// baseURL is the upstream origin ("https://ollama.com" from the
	// constructor); a field rather than a constant so tests can point
	// at a local stub server.
	baseURL string
	// httpClient carries the per-request timeout set at construction.
	httpClient *http.Client
}
|
|
|
|
func NewOllamaCloud(apiKey string, timeout time.Duration) *OllamaCloud {
|
|
if timeout == 0 {
|
|
timeout = 180 * time.Second
|
|
}
|
|
return &OllamaCloud{
|
|
apiKey: apiKey,
|
|
baseURL: "https://ollama.com",
|
|
httpClient: &http.Client{Timeout: timeout},
|
|
}
|
|
}
|
|
|
|
// Name reports the registry identifier for this provider.
func (o *OllamaCloud) Name() string {
	return "ollama_cloud"
}
|
|
// Available reports whether an API key was configured at construction.
func (o *OllamaCloud) Available() bool {
	return o.apiKey != ""
}
|
|
|
|
func (o *OllamaCloud) Chat(ctx context.Context, req Request) (*Response, error) {
|
|
// Strip "ollama_cloud/" prefix (Phase 4 scrum fix B-3 — Opus BLOCK).
|
|
// Pre-fix used "cloud" which never matched, so upstream got the
|
|
// prefixed model name and 400'd. ":cloud" suffix models pass
|
|
// through unchanged — that's the canonical name on ollama.com.
|
|
model := StripPrefix(req.Model, "ollama_cloud")
|
|
|
|
system, prompt := flattenMessages(req.Messages)
|
|
|
|
body := map[string]any{
|
|
"model": model,
|
|
"prompt": prompt,
|
|
"stream": false,
|
|
// Mirror local Ollama — skip reasoning by default. Callers that
|
|
// need thinking explicitly (e.g. for Kimi K2 long reasoning)
|
|
// will get a future Request.Think field; the v0 default keeps
|
|
// outputs short and predictable.
|
|
"think": false,
|
|
"options": map[string]any{
|
|
// Cloud reasoning models need headroom — 400 floor matches
|
|
// the Rust adapter's policy for kimi-k2.6 / gpt-oss:120b.
|
|
"num_predict": maxInt(req.MaxTokens, 400),
|
|
// Cloud-tier default 0.3 when caller didn't set; honor an
|
|
// explicit value otherwise. Anthropic 4.7 deprecation
|
|
// doesn't apply here (Ollama Cloud is its own server),
|
|
// so always send.
|
|
"temperature": tempPtrOr(req.Temperature, 0.3),
|
|
},
|
|
}
|
|
if system != "" {
|
|
body["system"] = system
|
|
}
|
|
if req.Format == "json" {
|
|
body["format"] = "json"
|
|
}
|
|
|
|
bs, _ := json.Marshal(body)
|
|
httpReq, err := http.NewRequestWithContext(ctx, "POST", o.baseURL+"/api/generate", bytes.NewReader(bs))
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
httpReq.Header.Set("Content-Type", "application/json")
|
|
httpReq.Header.Set("Authorization", "Bearer "+o.apiKey)
|
|
|
|
resp, err := o.httpClient.Do(httpReq)
|
|
if err != nil {
|
|
if errors.Is(ctx.Err(), context.DeadlineExceeded) {
|
|
return nil, fmt.Errorf("%w: ollama_cloud", ErrTimeout)
|
|
}
|
|
return nil, fmt.Errorf("ollama_cloud: %w", err)
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
rb, _ := io.ReadAll(resp.Body)
|
|
if resp.StatusCode/100 != 2 {
|
|
return nil, fmt.Errorf("%w: ollama_cloud %d: %s", ErrUpstream, resp.StatusCode, abbrev(string(rb), 300))
|
|
}
|
|
|
|
var cloudResp struct {
|
|
Model string `json:"model"`
|
|
Response string `json:"response"`
|
|
Done bool `json:"done"`
|
|
DoneReason string `json:"done_reason"`
|
|
PromptEvalCount int `json:"prompt_eval_count"`
|
|
EvalCount int `json:"eval_count"`
|
|
}
|
|
if err := json.Unmarshal(rb, &cloudResp); err != nil {
|
|
return nil, fmt.Errorf("ollama_cloud decode: %w (body=%s)", err, abbrev(string(rb), 200))
|
|
}
|
|
|
|
return &Response{
|
|
Model: model,
|
|
Content: cloudResp.Response,
|
|
InputTokens: cloudResp.PromptEvalCount,
|
|
OutputTokens: cloudResp.EvalCount,
|
|
FinishReason: finishReasonFromOllama(cloudResp.Done, cloudResp.DoneReason),
|
|
}, nil
|
|
}
|
|
|
|
// flattenMessages splits a Message list into Ollama Cloud's
|
|
// /api/generate shape (single system + concatenated prompt).
|
|
// Mirrors the Rust adapter's flatten_messages_public.
|
|
func flattenMessages(messages []Message) (system, prompt string) {
|
|
var sysParts []string
|
|
var promptParts []string
|
|
for _, m := range messages {
|
|
switch m.Role {
|
|
case "system":
|
|
sysParts = append(sysParts, m.Content)
|
|
case "assistant":
|
|
promptParts = append(promptParts, "Assistant: "+m.Content)
|
|
default: // "user" or anything else
|
|
promptParts = append(promptParts, "User: "+m.Content)
|
|
}
|
|
}
|
|
return strings.Join(sysParts, "\n\n"), strings.Join(promptParts, "\n\n")
|
|
}
|
|
|
|
// maxInt returns the larger of a and b.
func maxInt(a, b int) int {
	if b >= a {
		return b
	}
	return a
}
|
|
|
|
// defaultFloat returns v, or fallback when v is exactly zero.
// Note: a deliberately-set 0 is indistinguishable from unset here.
func defaultFloat(v, fallback float64) float64 {
	if v != 0 {
		return v
	}
	return fallback
}
|
|
|
|
// tempPtrOr dereferences v, or returns fallback when the caller left
// the temperature unset (nil pointer). Unlike a zero-value sentinel,
// the pointer form lets callers send an explicit 0.
func tempPtrOr(v *float64, fallback float64) float64 {
	if v != nil {
		return *v
	}
	return fallback
}
|