Implements PROMPT.md / docs/REVIEW_PIPELINE.md Phase 2:
- internal/llm/ollama.go — real Ollama provider:
- HealthCheck probes /api/tags + a 1-token completion + a JSON-mode
probe ({"ok": true} round-trip), populating the model-doctor.json
schema documented in docs/LOCAL_MODEL_SETUP.md
- Complete + CompleteJSON via /api/chat with stream=false
- think=false set for ALL completions (qwen3.5:latest is reasoning-
capable but the inner-loop hot path wants direct answers, not
reasoning traces consuming the token budget — same finding as
the Lakehouse-Go chatd 2026-04-30 wave); request shape sketched
after this list
- internal/llm/review.go — Reviewer wrapper:
- 2-attempt flow: prompt → parse → repair-prompt → parse
- Strict JSON shape enforced; markdown fences stripped before parse
- Severity normalized to enum; out-of-range confidence clamped
- Per-file chunking (file-level for v0; function-level Phase D+)
- Bounded by review-profile max_file_bytes + max_llm_chunk_chars
- pipeline.go — Phase 2 wired between static scan + report gen:
- --enable-llm flag opts in (off by default — static-only is
cheaper and faster)
- Raw output ALWAYS saved to llm-findings.raw.json (forensics)
- Normalized findings → llm-findings.normalized.json
- LLM findings merged into the report findings list (sourced
"llm" so consumers can filter)
- Receipts honestly mark phase status: "ok" | "degraded" | "skipped"
(wiring sketched after the file listing below)
- cli model doctor — real probes replace the Phase A stub.
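
For reference, the request shape ollama.go builds for /api/chat, sketched
under the assumption that it follows Ollama's documented chat API (the
actual Go types in ollama.go aren't reproduced here):

```go
// Sketch only: field names follow Ollama's public /api/chat API.
// stream=false returns one complete message instead of a token
// stream; think=false suppresses reasoning traces; format="json"
// is how a CompleteJSON call would request JSON mode.
type chatMessage struct {
	Role    string `json:"role"` // "system" | "user" | "assistant"
	Content string `json:"content"`
}

type chatRequest struct {
	Model    string        `json:"model"` // e.g. "qwen3.5:latest"
	Messages []chatMessage `json:"messages"`
	Stream   bool          `json:"stream"`           // always false here
	Think    bool          `json:"think"`            // always false (see above)
	Format   string        `json:"format,omitempty"` // "json" for JSON mode
}
```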
Verified:
- model doctor: status="ok" with qwen3.5:latest + qwen3:latest both
loaded, basic_prompt_ok=true, json_mode_ok=true (receipt shape
sketched after this list)
- insecure-repo with --enable-llm: 9 LLM findings; qwen3.5 correctly
flagged SQLi, RCE, hardcoded credentials as critical with verbatim
evidence; 27s wall for 3 chunks
- clean-repo with --enable-llm: 0 LLM findings, 4 parsed chunks, 2.8s
- self-review with --enable-llm: 77 LLM findings + 83 static; 3 of
~30 chunks needed retry (PROMPT.md, REPORT_SCHEMA.md,
SCRUM_TEST_TEMPLATE.md — all eventually parsed); 5min wall
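
The model-doctor.json fields exercised by that run, sketched as a Go
shape. Only the three fields named above are grounded in this message;
docs/LOCAL_MODEL_SETUP.md remains the canonical schema, so treat the
rest as assumptions:

```go
// Assumed shape of model-doctor.json, inferred from the probe
// results quoted above; docs/LOCAL_MODEL_SETUP.md is authoritative.
type modelDoctorReport struct {
	Status        string   `json:"status"`          // "ok" observed; other values per the docs
	Models        []string `json:"models"`          // assumption: model names returned by /api/tags
	BasicPromptOK bool     `json:"basic_prompt_ok"` // 1-token completion probe
	JSONModeOK    bool     `json:"json_mode_ok"`    // {"ok": true} JSON-mode round-trip
}
```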
go vet + go test -short clean. Fixture stray.go now declares
`package fixture` so Go tooling doesn't choke on the orphan file.
Phase D (validator cross-check) + Phase E (memory + diff/rules
subcommands) remain.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

internal/llm/review.go (296 lines, 9.3 KiB, Go):
// Phase 2 (LLM review) implementation. Sends bounded chunks of the
// repo to the local model, asks for strict JSON Findings, retries
// once on parse failure, marks the phase degraded if the second
// attempt also fails. Raw output is saved either way — operators
// can re-parse manually if the harness rejected something useful.
package llm

import (
	"context"
	"encoding/json"
	"fmt"
	"strings"

	"local-review-harness/internal/analyzers"
	"local-review-harness/internal/scanner"
)

// ReviewInput is one bounded review request. The harness chunks the
// scan result into ReviewInputs (one per file or one per file-group)
// before calling Review.
type ReviewInput struct {
	ChunkID     string // stable per-chunk identifier (file path for v0)
	Description string // human label (e.g. "internal/foo/bar.go")
	Content     string // the actual code/content to review
	Language    string // for the prompt context
}

// ReviewOutput is what one Review call produces. RawContent is the
// model's verbatim output before parsing — saved for forensics if
// parsing fails.
type ReviewOutput struct {
	ChunkID    string              `json:"chunk_id"`
	Findings   []analyzers.Finding `json:"findings"`
	RawContent string              `json:"raw_content"`
	Parsed     bool                `json:"parsed"`
	Retried    bool                `json:"retried"`
	Error      string              `json:"error,omitempty"`
}

// Reviewer wraps a Provider with the prompt + retry logic. Stateless;
// the prompt template is baked in for v0.
type Reviewer struct {
	prov  Provider
	model string
	opts  CompleteOptions
}

// NewReviewer constructs a Reviewer pointing at the configured
// primary model. opts are passed through to every Complete call;
// callers tune via review-profile.
func NewReviewer(prov Provider, model string, opts CompleteOptions) *Reviewer {
	if opts.TimeoutSeconds == 0 {
		opts.TimeoutSeconds = 120
	}
	return &Reviewer{prov: prov, model: model, opts: opts}
}

// Review runs the 2-attempt flow: prompt → parse → retry-with-repair-prompt → parse.
func (r *Reviewer) Review(ctx context.Context, in ReviewInput) ReviewOutput {
	out := ReviewOutput{ChunkID: in.ChunkID}

	// Attempt 1
	prompt := buildReviewPrompt(in, false)
	raw, err := r.prov.CompleteJSON(ctx, r.model, prompt, r.opts)
	out.RawContent = raw
	if err != nil {
		out.Error = "request failed: " + err.Error()
		return out
	}
	if findings, perr := parseFindings(raw, in); perr == nil {
		out.Findings = findings
		out.Parsed = true
		return out
	}

	// Attempt 2 (repair prompt — feed the raw output back + ask for
	// strict JSON only). Done once; second failure is degraded.
	out.Retried = true
	repair := buildRepairPrompt(in, raw)
	raw2, err := r.prov.CompleteJSON(ctx, r.model, repair, r.opts)
	out.RawContent = raw + "\n\n---repair---\n\n" + raw2
	if err != nil {
		out.Error = "repair request failed: " + err.Error()
		return out
	}
	findings, perr := parseFindings(raw2, in)
	if perr != nil {
		out.Error = "parse failed after repair: " + perr.Error()
		return out
	}
	out.Findings = findings
	out.Parsed = true
	return out
}

// ReviewBatch runs Review over a slice of inputs sequentially. Could
// parallelize at G3+, but local Ollama is GPU-bound and serial is
// the safe v0 — burst-parallel would queue at the model server anyway.
func (r *Reviewer) ReviewBatch(ctx context.Context, inputs []ReviewInput) []ReviewOutput {
	out := make([]ReviewOutput, 0, len(inputs))
	for _, in := range inputs {
		select {
		case <-ctx.Done():
			out = append(out, ReviewOutput{
				ChunkID: in.ChunkID,
				Error:   "context cancelled before chunk processed",
			})
			continue
		default:
		}
		out = append(out, r.Review(ctx, in))
	}
	return out
}

// === prompts ===

const reviewSystemPrompt = `You are a senior code reviewer auditing a single source file.

Your job: emit a JSON object with a "findings" array. Each finding
must include:
- title (string, < 80 chars)
- severity ("low" | "medium" | "high" | "critical")
- file (string, the file path you were asked to review — verbatim)
- line_hint (string, e.g. "42" or "100-110")
- evidence (string, a SHORT direct quote from the file — must
  exist verbatim in the source so a downstream validator can
  grep it)
- reason (string, one sentence explaining why this is a finding)
- suggested_fix (string, optional, one sentence)
- confidence (number 0.0–1.0)

Severity guidance:
- critical: credential leak, RCE risk, destructive command,
  unauthenticated mutation
- high: SQL injection, broad CORS, fail-open auth, unsafe FS
- medium: hardcoded paths, weak error handling, missing tests
  near important code
- low: naming, duplication, doc drift

Hard rules (failure = your output is rejected):
1. Output ONLY the JSON object. No prose before or after.
2. The evidence field MUST be a verbatim substring of the file.
   If you can't quote the source, drop the finding.
3. Don't invent file paths, line numbers, or test names.
4. If the file is clean, return {"findings": []}.
5. Output nothing else when you're done.`

func buildReviewPrompt(in ReviewInput, _ bool) string {
	var b strings.Builder
	b.WriteString(reviewSystemPrompt)
	b.WriteString("\n\n---\n\n")
	b.WriteString("File path: ")
	b.WriteString(in.Description)
	b.WriteString("\nLanguage: ")
	b.WriteString(in.Language)
	b.WriteString("\n\nFile content:\n```\n")
	b.WriteString(in.Content)
	b.WriteString("\n```\n\nReturn JSON only.")
	return b.String()
}

func buildRepairPrompt(in ReviewInput, prev string) string {
	var b strings.Builder
	b.WriteString("Your previous output was not valid JSON or did not match the required schema.\n\n")
	b.WriteString("Required shape:\n")
	b.WriteString(`{"findings":[{"title":"...","severity":"...","file":"...","line_hint":"...","evidence":"...","reason":"...","confidence":0.0}]}`)
	b.WriteString("\n\nPrevious raw output (for your reference):\n")
	b.WriteString(abbrev(prev, 1500))
	b.WriteString("\n\nFor reference, the file you were reviewing was:\n")
	b.WriteString(in.Description)
	b.WriteString("\n\nReturn ONLY the JSON object now. No explanation, no markdown fences, no apology. JSON only.")
	return b.String()
}

// === parsing ===

func parseFindings(raw string, in ReviewInput) ([]analyzers.Finding, error) {
	// Strip leading/trailing whitespace + common markdown fences.
	cleaned := strings.TrimSpace(raw)
	cleaned = strings.TrimPrefix(cleaned, "```json")
	cleaned = strings.TrimPrefix(cleaned, "```")
	cleaned = strings.TrimSuffix(cleaned, "```")
	cleaned = strings.TrimSpace(cleaned)
	if cleaned == "" {
		return nil, fmt.Errorf("empty content")
	}

	var shell struct {
		Findings []struct {
			Title        string  `json:"title"`
			Severity     string  `json:"severity"`
			File         string  `json:"file"`
			LineHint     string  `json:"line_hint"`
			Evidence     string  `json:"evidence"`
			Reason       string  `json:"reason"`
			SuggestedFix string  `json:"suggested_fix"`
			Confidence   float64 `json:"confidence"`
		} `json:"findings"`
	}
	if err := json.Unmarshal([]byte(cleaned), &shell); err != nil {
		return nil, fmt.Errorf("unmarshal: %w", err)
	}

	out := make([]analyzers.Finding, 0, len(shell.Findings))
	for _, f := range shell.Findings {
		sev := normalizeSeverity(f.Severity)
		if sev == "" {
			continue // model emitted a value we don't accept
		}
		// Use the chunk's file path if model omitted/lied
		filePath := f.File
		if filePath == "" {
			filePath = in.Description
		}
		out = append(out, analyzers.Finding{
			Title:        truncate(f.Title, 80),
			Severity:     sev,
			Status:       analyzers.StatusSuspected, // validator (Phase D) promotes to confirmed
			File:         filePath,
			LineHint:     f.LineHint,
			Evidence:     f.Evidence,
			Reason:       f.Reason,
			SuggestedFix: f.SuggestedFix,
			Source:       analyzers.SourceLLM,
			Confidence:   clampFloat(f.Confidence, 0, 1),
			CheckID:      "llm.review",
		})
	}
	return out, nil
}

func normalizeSeverity(s string) analyzers.Severity {
	switch strings.ToLower(strings.TrimSpace(s)) {
	case "low":
		return analyzers.SeverityLow
	case "medium", "med":
		return analyzers.SeverityMedium
	case "high":
		return analyzers.SeverityHigh
	case "critical", "crit":
		return analyzers.SeverityCritical
	}
	return ""
}

func truncate(s string, n int) string {
	if len(s) <= n {
		return s
	}
	return s[:n]
}

func clampFloat(v, lo, hi float64) float64 {
	if v < lo {
		return lo
	}
	if v > hi {
		return hi
	}
	return v
}

// === chunking ===

// ChunkInputsFromScan produces one ReviewInput per file under the
// configured size limit. Files larger than maxBytes are skipped (the
// LLM phase notes them in the receipt as "skipped: too large"). v0
// is per-file; per-function chunking lands in Phase D+.
func ChunkInputsFromScan(scan *scanner.Result, maxBytes int, maxChunkChars int, readFile func(abs string) string) []ReviewInput {
	out := []ReviewInput{}
	for _, f := range scan.Files {
		if f.Language == "" {
			continue // non-code files: skip LLM review (analyzers may still flag)
		}
		if f.Size > int64(maxBytes) {
			continue
		}
		content := readFile(f.Abs)
		if len(content) > maxChunkChars {
			content = content[:maxChunkChars] + "\n... (truncated for LLM context)\n"
		}
		out = append(out, ReviewInput{
			ChunkID:     f.Path,
			Description: f.Path,
			Content:     content,
			Language:    f.Language,
		})
	}
	return out
}
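
And a hedged sketch of how pipeline.go plausibly wires these pieces
together, per the bullet list at the top. The provider constructor, the
byte/char bounds, and the status derivation are illustrative
assumptions, not pipeline.go's actual code:

```go
// Hypothetical wiring, mirroring the commit message above.
// NewOllamaProvider and the literal bounds are illustrative names
// and values; the real ones come from ollama.go + the review-profile.
func runLLMPhase(ctx context.Context, scan *scanner.Result, readFile func(string) string) ([]analyzers.Finding, string) {
	prov := NewOllamaProvider("http://localhost:11434") // assumed constructor from ollama.go
	rev := NewReviewer(prov, "qwen3.5:latest", CompleteOptions{TimeoutSeconds: 120})

	inputs := ChunkInputsFromScan(scan, 64*1024, 24*1024, readFile) // bounds: illustrative
	outputs := rev.ReviewBatch(ctx, inputs)
	// pipeline.go also writes every output's RawContent to
	// llm-findings.raw.json before any parsing decisions (not shown).

	status := "ok"
	var findings []analyzers.Finding
	for _, o := range outputs {
		if !o.Parsed {
			status = "degraded" // a chunk failed both attempts; raw output is still on disk
			continue
		}
		findings = append(findings, o.Findings...)
	}
	if len(inputs) == 0 {
		status = "skipped" // assumption: also set when --enable-llm is off
	}
	return findings, status
}
```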