package replay

import (
	"bufio"
	"encoding/json"
	"os"
	"path/filepath"
	"regexp"
	"sort"
	"strings"
)

// tokenize lowercases text and splits it on runs of characters outside
// [a-z0-9_], keeping only tokens of length ≥3. The result is a set, so
// repeated tokens collapse to one entry. Matches replay.ts so retrieval
// scoring is consistent across runtimes.
func tokenize(text string) map[string]struct{} {
	out := map[string]struct{}{}
	isSeparator := func(r rune) bool {
		isToken := (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9') || r == '_'
		return !isToken
	}
	// Tokens are pure ASCII by construction, so byte length equals rune count.
	for _, tok := range strings.FieldsFunc(strings.ToLower(text), isSeparator) {
		if len(tok) >= 3 {
			out[tok] = struct{}{}
		}
	}
	return out
}

// jaccard returns |A ∩ B| / |A ∪ B| over token sets. It returns 0 when
// either set is empty.
func jaccard(a, b map[string]struct{}) float64 {
	if len(a) == 0 || len(b) == 0 {
		return 0
	}
	// Walk the smaller set and probe the larger one: same result, fewer
	// lookups.
	small, large := a, b
	if len(small) > len(large) {
		small, large = large, small
	}
	inter := 0
	for t := range small {
		if _, ok := large[t]; ok {
			inter++
		}
	}
	// Both sets are non-empty here, so the union is at least 1 and the
	// division is always defined.
	return float64(inter) / float64(len(a)+len(b)-inter)
}

// LoadRagCorpus reads `exports/rag/playbooks.jsonl` under root, one JSON
// record per line.
//
// A missing file is not an error: it yields a nil slice and a nil error so
// callers can fall back to a context-less prompt rather than failing. Any
// other open error is returned as-is. Empty lines and lines that fail to
// decode are skipped. If the scanner stops early (for example on a line
// longer than the 16 MiB limit), the records decoded so far are returned
// together with the scanner's error.
func LoadRagCorpus(root string) ([]RagSample, error) {
	const (
		initialLineBuf = 1 << 16 // starting scanner buffer: 64 KiB
		maxLineBytes   = 1 << 24 // longest accepted line: 16 MiB
	)

	path := filepath.Join(root, "exports", "rag", "playbooks.jsonl")
	f, err := os.Open(path)
	if err != nil {
		if os.IsNotExist(err) {
			return nil, nil
		}
		return nil, err
	}
	defer f.Close()

	var corpus []RagSample
	sc := bufio.NewScanner(f)
	sc.Buffer(make([]byte, 0, initialLineBuf), maxLineBytes)
	for sc.Scan() {
		line := sc.Bytes()
		if len(line) == 0 {
			continue
		}
		var rec RagSample
		if err := json.Unmarshal(line, &rec); err != nil {
			continue // malformed line — skip, matches TS behavior
		}
		corpus = append(corpus, rec)
	}
	return corpus, sc.Err()
}

// retrieveRag returns up to topK playbooks with non-zero overlap.
// Sorted by score descending. Matches replay.ts.
func retrieveRag(corpus []RagSample, task string, topK int) []RetrievedArtifact { taskTokens := tokenize(task) type scored struct { rec RagSample score float64 } all := make([]scored, 0, len(corpus)) for _, r := range corpus { text := r.Title + " " + r.Content + " " + strings.Join(r.Tags, " ") all = append(all, scored{rec: r, score: jaccard(taskTokens, tokenize(text))}) } sort.SliceStable(all, func(i, j int) bool { return all[i].score > all[j].score }) out := make([]RetrievedArtifact, 0, topK) for _, s := range all { if len(out) >= topK { break } if s.score <= 0 { break } out = append(out, RetrievedArtifact{ RagID: s.rec.ID, SourceRunID: s.rec.SourceRunID, Title: s.rec.Title, ContentPreview: trim(s.rec.Content, 240), SuccessScore: s.rec.SuccessScore, Tags: tagsOrEmpty(s.rec.Tags), Score: s.score, }) } return out } var validationLineRE = regexp.MustCompile(`(?i)^[-*]\s*(verify|check|assert|confirm|ensure)\b|^\s*(verify|check|assert|confirm|ensure)\s`) // extractValidationSteps pulls verify/check/assert/confirm/ensure // lines from accepted samples. Used as a soft-anchor in the // validation gate (response should touch at least one of these // tokens) and surfaced into the prompt. func extractValidationSteps(samples []RetrievedArtifact, corpus []RagSample) []string { ids := map[string]struct{}{} for _, s := range samples { ids[s.RagID] = struct{}{} } var steps []string for _, r := range corpus { if _, ok := ids[r.ID]; !ok { continue } for _, line := range strings.Split(r.Content, "\n") { t := strings.TrimSpace(line) if validationLineRE.MatchString(t) { steps = append(steps, trim(t, 200)) if len(steps) >= 6 { return steps } } } } return steps } // BuildContextBundle assembles a ContextBundle from a corpus + task. // Top 8 retrieved → split by success_score → at most 3 accepted, 2 // warnings → extract validation steps → estimate token cost. 
func BuildContextBundle(corpus []RagSample, task string) *ContextBundle { top := retrieveRag(corpus, task, 8) accepted := filterByScore(top, "accepted", 3) warnings := filterByScore(top, "partially_accepted", 2) steps := extractValidationSteps(accepted, corpus) totalChars := 0 for _, r := range accepted { totalChars += len(r.ContentPreview) + len(r.Title) } for _, r := range warnings { totalChars += len(r.ContentPreview) + len(r.Title) } for _, s := range steps { totalChars += len(s) } tokenEstimate := (totalChars + 3) / 4 // ceil(chars/4) return &ContextBundle{ RetrievedPlaybooks: top, PriorSuccessfulOutputs: accepted, FailurePatterns: warnings, ValidationSteps: stepsOrEmpty(steps), BundleTokenEstimate: tokenEstimate, } } func filterByScore(arts []RetrievedArtifact, score string, max int) []RetrievedArtifact { out := make([]RetrievedArtifact, 0, max) for _, a := range arts { if a.SuccessScore == score { out = append(out, a) if len(out) >= max { break } } } return out } func tagsOrEmpty(t []string) []string { if t == nil { return []string{} } return t } func stepsOrEmpty(s []string) []string { if s == nil { return []string{} } return s } func trim(s string, n int) string { if len(s) <= n { return s } return s[:n] }