Claude (review-harness setup) e346b54e0f Phase C — local-Ollama LLM review wired end-to-end
Implements PROMPT.md / docs/REVIEW_PIPELINE.md Phase 2:
- internal/llm/ollama.go — real Ollama provider:
  - HealthCheck probes /api/tags + a 1-token completion + a JSON-mode
    probe ({"ok": true} round-trip), populating the model-doctor.json
    schema documented in docs/LOCAL_MODEL_SETUP.md
  - Complete + CompleteJSON via /api/chat with stream=false
  - think=false set for ALL completions (qwen3.5:latest is reasoning-
    capable, but the inner-loop hot path wants direct answers, not
    reasoning traces consuming the token budget; same finding as the
    Lakehouse-Go chatd 2026-04-30 wave); request shape sketched below
    the list
- internal/llm/review.go — Reviewer wrapper:
  - 2-attempt flow: prompt → parse → repair-prompt → parse
  - Strict JSON shape enforced; markdown fences stripped before parse
  - Severity normalized to enum; out-of-range confidence clamped
  - Per-file chunking (file-level for v0; function-level Phase D+)
  - Bounded by review-profile max_file_bytes + max_llm_chunk_chars
- pipeline.go — Phase 2 wired between static scan + report gen:
  - --enable-llm flag opts in (off by default — static-only is
    cheaper and faster)
  - Raw output ALWAYS saved to llm-findings.raw.json (forensics)
  - Normalized findings → llm-findings.normalized.json
  - LLM findings merged into the report findings list (sourced
    "llm" so consumers can filter)
  - Receipts honestly mark phase status: "ok" | "degraded" | "skipped"
- cli model doctor — real probes replace the Phase A stub.
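
For orientation, a minimal sketch of that chat request. The JSON keys
(model, messages, stream, think, format) follow Ollama's documented
/api/chat API; the Go types and field names here are illustrative, not
the actual ollama.go code:

type chatRequest struct {
	Model    string        `json:"model"`
	Messages []chatMessage `json:"messages"`
	Stream   bool          `json:"stream"`           // false: one complete response
	Think    bool          `json:"think"`            // false: suppress reasoning traces
	Format   string        `json:"format,omitempty"` // "json" for CompleteJSON
}

type chatMessage struct {
	Role    string `json:"role"` // "user" for the single-shot review prompt
	Content string `json:"content"`
}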

Verified:
- model doctor: status="ok" with qwen3.5:latest + qwen3:latest both
  loaded, basic_prompt_ok=true, json_mode_ok=true (report shape
  sketched below the list)
- insecure-repo with --enable-llm: 9 LLM findings; qwen3.5 correctly
  flagged SQLi, RCE, hardcoded credentials as critical with verbatim
  evidence; 27s wall for 3 chunks
- clean-repo with --enable-llm: 0 LLM findings, 4 parsed chunks, 2.8s
- self-review with --enable-llm: 77 LLM findings + 83 static; 3 of
  ~30 chunks needed retry (PROMPT.md, REPORT_SCHEMA.md,
  SCRUM_TEST_TEMPLATE.md — all eventually parsed); 5min wall
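
A hedged sketch of the model-doctor.json shape those probes populate.
Only status, basic_prompt_ok, and json_mode_ok are confirmed above;
the struct name and the models field are assumptions (the schema
proper lives in docs/LOCAL_MODEL_SETUP.md):

type doctorReport struct {
	Status        string   `json:"status"`          // "ok" observed; other values assumed
	Models        []string `json:"models"`          // assumed field; e.g. qwen3.5:latest
	BasicPromptOK bool     `json:"basic_prompt_ok"` // 1-token completion probe
	JSONModeOK    bool     `json:"json_mode_ok"`    // {"ok": true} round-trip probe
}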

go vet + go test -short clean. Fixture stray.go now `package fixture`
so go-tooling doesn't choke on the orphan.

Phase D (validator cross-check) + Phase E (memory + diff/rules
subcommands) remain.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-30 01:13:39 -05:00

// Phase 2 (LLM review) implementation. Sends bounded chunks of the
// repo to the local model, asks for strict JSON Findings, retries
// once on parse failure, marks the phase degraded if the second
// attempt also fails. Raw output is saved either way — operators
// can re-parse manually if the harness rejected something useful.
package llm

import (
	"context"
	"encoding/json"
	"fmt"
	"strings"

	"local-review-harness/internal/analyzers"
	"local-review-harness/internal/scanner"
)

// ReviewInput is one bounded review request. The harness chunks the
// scan result into ReviewInputs (one per file or one per file-group)
// before calling Review.
type ReviewInput struct {
	ChunkID     string // stable per-chunk identifier (file path for v0)
	Description string // human label (e.g. "internal/foo/bar.go")
	Content     string // the actual code/content to review
	Language    string // for the prompt context
}

// ReviewOutput is what one Review call produces. RawContent is the
// model's verbatim output before parsing — saved for forensics if
// parsing fails.
type ReviewOutput struct {
	ChunkID    string              `json:"chunk_id"`
	Findings   []analyzers.Finding `json:"findings"`
	RawContent string              `json:"raw_content"`
	Parsed     bool                `json:"parsed"`
	Retried    bool                `json:"retried"`
	Error      string              `json:"error,omitempty"`
}

// Reviewer wraps a Provider with the prompt + retry logic. Stateless;
// the prompt template is baked in for v0.
type Reviewer struct {
	prov  Provider
	model string
	opts  CompleteOptions
}

// NewReviewer constructs a Reviewer pointing at the configured
// primary model. opts are passed through to every Complete call;
// callers tune via review-profile.
func NewReviewer(prov Provider, model string, opts CompleteOptions) *Reviewer {
	if opts.TimeoutSeconds == 0 {
		opts.TimeoutSeconds = 120
	}
	return &Reviewer{prov: prov, model: model, opts: opts}
}

// Review runs the 2-attempt flow: prompt → parse → retry-with-repair-prompt → parse.
func (r *Reviewer) Review(ctx context.Context, in ReviewInput) ReviewOutput {
	out := ReviewOutput{ChunkID: in.ChunkID}

	// Attempt 1
	prompt := buildReviewPrompt(in, false)
	raw, err := r.prov.CompleteJSON(ctx, r.model, prompt, r.opts)
	out.RawContent = raw
	if err != nil {
		out.Error = "request failed: " + err.Error()
		return out
	}
	if findings, perr := parseFindings(raw, in); perr == nil {
		out.Findings = findings
		out.Parsed = true
		return out
	}

	// Attempt 2 (repair prompt — feed the raw output back + ask for
	// strict JSON only). Done once; second failure is degraded.
	out.Retried = true
	repair := buildRepairPrompt(in, raw)
	raw2, err := r.prov.CompleteJSON(ctx, r.model, repair, r.opts)
	out.RawContent = raw + "\n\n---repair---\n\n" + raw2
	if err != nil {
		out.Error = "repair request failed: " + err.Error()
		return out
	}
	findings, perr := parseFindings(raw2, in)
	if perr != nil {
		out.Error = "parse failed after repair: " + perr.Error()
		return out
	}
	out.Findings = findings
	out.Parsed = true
	return out
}

// ReviewBatch runs Review over a slice of inputs sequentially. Could
// parallelize at G3+, but local Ollama is GPU-bound and serial is
// the safe v0 — burst-parallel would queue at the model server anyway.
func (r *Reviewer) ReviewBatch(ctx context.Context, inputs []ReviewInput) []ReviewOutput {
	out := make([]ReviewOutput, 0, len(inputs))
	for _, in := range inputs {
		select {
		case <-ctx.Done():
			out = append(out, ReviewOutput{
				ChunkID: in.ChunkID,
				Error:   "context cancelled before chunk processed",
			})
			continue
		default:
		}
		out = append(out, r.Review(ctx, in))
	}
	return out
}

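// Example wiring from the pipeline side (illustrative; prov, opts,
// scanRes, maxFileBytes, maxChunkChars, and read are stand-in names,
// not pipeline.go's actual identifiers):
//
//	reviewer := llm.NewReviewer(prov, "qwen3.5:latest", opts)
//	inputs := llm.ChunkInputsFromScan(scanRes, maxFileBytes, maxChunkChars, read)
//	outputs := reviewer.ReviewBatch(ctx, inputs)
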
// === prompts ===
const reviewSystemPrompt = `You are a senior code reviewer auditing a single source file.
Your job: emit a JSON object with a "findings" array. Each finding
must include:
- title (string, < 80 chars)
- severity ("low" | "medium" | "high" | "critical")
- file (string, the file path you were asked to review — verbatim)
- line_hint (string, e.g. "42" or "100-110")
- evidence (string, a SHORT direct quote from the file — must
  exist verbatim in the source so a downstream validator can
  grep it)
- reason (string, one sentence explaining why this is a finding)
- suggested_fix (string, optional, one sentence)
- confidence (number 0.0–1.0)
Severity guidance:
- critical: credential leak, RCE risk, destructive command,
  unauthenticated mutation
- high: SQL injection, broad CORS, fail-open auth, unsafe FS
- medium: hardcoded paths, weak error handling, missing tests
  near important code
- low: naming, duplication, doc drift
Hard rules (failure = your output is rejected):
1. Output ONLY the JSON object. No prose before or after.
2. The evidence field MUST be a verbatim substring of the file.
   If you can't quote the source, drop the finding.
3. Don't invent file paths, line numbers, or test names.
4. If the file is clean, return {"findings": []}.
5. Output nothing else when you're done.`
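
// A conforming reply, with illustrative values, looks like:
//
//	{"findings":[{"title":"Hardcoded credential","severity":"critical",
//	 "file":"cmd/app/main.go","line_hint":"42","evidence":"password := \"hunter2\"",
//	 "reason":"Secret committed to source.","suggested_fix":"Load from env.",
//	 "confidence":0.9}]}
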
func buildReviewPrompt(in ReviewInput, _ bool) string {
	var b strings.Builder
	b.WriteString(reviewSystemPrompt)
	b.WriteString("\n\n---\n\n")
	b.WriteString("File path: ")
	b.WriteString(in.Description)
	b.WriteString("\nLanguage: ")
	b.WriteString(in.Language)
	b.WriteString("\n\nFile content:\n```\n")
	b.WriteString(in.Content)
	b.WriteString("\n```\n\nReturn JSON only.")
	return b.String()
}

func buildRepairPrompt(in ReviewInput, prev string) string {
	var b strings.Builder
	b.WriteString("Your previous output was not valid JSON or did not match the required schema.\n\n")
	b.WriteString("Required shape:\n")
	b.WriteString(`{"findings":[{"title":"...","severity":"...","file":"...","line_hint":"...","evidence":"...","reason":"...","confidence":0.0}]}`)
	b.WriteString("\n\nPrevious raw output (for your reference):\n")
	b.WriteString(abbrev(prev, 1500)) // abbrev (defined elsewhere in this package) caps the replayed output
	b.WriteString("\n\nFor reference, the file you were reviewing was:\n")
	b.WriteString(in.Description)
	b.WriteString("\n\nReturn ONLY the JSON object now. No explanation, no markdown fences, no apology. JSON only.")
	return b.String()
}

// === parsing ===
func parseFindings(raw string, in ReviewInput) ([]analyzers.Finding, error) {
	// Strip leading/trailing whitespace + common markdown fences.
	cleaned := strings.TrimSpace(raw)
	cleaned = strings.TrimPrefix(cleaned, "```json")
	cleaned = strings.TrimPrefix(cleaned, "```")
	cleaned = strings.TrimSuffix(cleaned, "```")
	cleaned = strings.TrimSpace(cleaned)
	if cleaned == "" {
		return nil, fmt.Errorf("empty content")
	}

	var shell struct {
		Findings []struct {
			Title        string  `json:"title"`
			Severity     string  `json:"severity"`
			File         string  `json:"file"`
			LineHint     string  `json:"line_hint"`
			Evidence     string  `json:"evidence"`
			Reason       string  `json:"reason"`
			SuggestedFix string  `json:"suggested_fix"`
			Confidence   float64 `json:"confidence"`
		} `json:"findings"`
	}
	if err := json.Unmarshal([]byte(cleaned), &shell); err != nil {
		return nil, fmt.Errorf("unmarshal: %w", err)
	}

	out := make([]analyzers.Finding, 0, len(shell.Findings))
	for _, f := range shell.Findings {
		sev := normalizeSeverity(f.Severity)
		if sev == "" {
			continue // model emitted a value we don't accept
		}
		// Use the chunk's file path if model omitted/lied
		filePath := f.File
		if filePath == "" {
			filePath = in.Description
		}
		out = append(out, analyzers.Finding{
			Title:        truncate(f.Title, 80),
			Severity:     sev,
			Status:       analyzers.StatusSuspected, // validator (Phase D) promotes to confirmed
			File:         filePath,
			LineHint:     f.LineHint,
			Evidence:     f.Evidence,
			Reason:       f.Reason,
			SuggestedFix: f.SuggestedFix,
			Source:       analyzers.SourceLLM,
			Confidence:   clampFloat(f.Confidence, 0, 1),
			CheckID:      "llm.review",
		})
	}
	return out, nil
}

func normalizeSeverity(s string) analyzers.Severity {
	switch strings.ToLower(strings.TrimSpace(s)) {
	case "low":
		return analyzers.SeverityLow
	case "medium", "med":
		return analyzers.SeverityMedium
	case "high":
		return analyzers.SeverityHigh
	case "critical", "crit":
		return analyzers.SeverityCritical
	}
	return ""
}

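// NOTE: truncate is byte-based; a multi-byte UTF-8 rune at the cut
// point can be split. Titles are capped at 80 bytes above, so the
// worst case is a mangled trailing character, not a parse failure.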
func truncate(s string, n int) string {
	if len(s) <= n {
		return s
	}
	return s[:n]
}

func clampFloat(v, lo, hi float64) float64 {
	if v < lo {
		return lo
	}
	if v > hi {
		return hi
	}
	return v
}

// === chunking ===
// ChunkInputsFromScan produces one ReviewInput per file under the
// configured size limit. Files larger than maxBytes are skipped (the
// LLM phase notes them in the receipt as "skipped: too large"). v0
// is per-file; per-function chunking lands in Phase D+.
func ChunkInputsFromScan(scan *scanner.Result, maxBytes int, maxChunkChars int, readFile func(abs string) string) []ReviewInput {
	out := []ReviewInput{}
	for _, f := range scan.Files {
		if f.Language == "" {
			continue // non-code files: skip LLM review (analyzers may still flag)
		}
		if f.Size > int64(maxBytes) {
			continue
		}
		content := readFile(f.Abs)
		if len(content) > maxChunkChars {
			content = content[:maxChunkChars] + "\n... (truncated for LLM context)\n"
		}
		out = append(out, ReviewInput{
			ChunkID:     f.Path,
			Description: f.Path,
			Content:     content,
			Language:    f.Language,
		})
	}
	return out
}