/// Bucket operation error journal. /// /// Every bucket op failure and every rescue fallback lands here. Goal: /// answering "is anything broken?" with one HTTP call. /// /// Storage: batched write-once files under `_errors/bucket_errors/` in the /// primary bucket. Uses the shared `AppendLog` helper so we never rewrite /// existing files — see `append_log.rs` for the full pattern rationale. /// /// In-memory ring buffer holds the last N events for fast response; on /// startup `load_recent` hydrates it from all batch files. use chrono::{DateTime, Utc}; use object_store::ObjectStore; use serde::{Deserialize, Serialize}; use std::collections::VecDeque; use std::sync::Arc; use tokio::sync::RwLock; use crate::append_log::{AppendLog, CompactStats}; const JOURNAL_PREFIX: &str = "_errors/bucket_errors"; const RING_CAPACITY: usize = 2000; #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(rename_all = "lowercase")] pub enum BucketOp { Read, Write, Delete, List, } #[derive(Debug, Clone, Serialize, Deserialize)] pub struct BucketErrorEvent { pub ts: DateTime, pub op: BucketOp, pub target: String, pub key: String, pub error: String, #[serde(default)] pub rescued: bool, } impl BucketErrorEvent { pub fn new_read(target: &str, key: &str, error: &str) -> Self { Self { ts: Utc::now(), op: BucketOp::Read, target: target.into(), key: key.into(), error: error.into(), rescued: false } } pub fn new_write(target: &str, key: &str, error: &str) -> Self { Self { ts: Utc::now(), op: BucketOp::Write, target: target.into(), key: key.into(), error: error.into(), rescued: false } } pub fn new_delete(target: &str, key: &str, error: &str) -> Self { Self { ts: Utc::now(), op: BucketOp::Delete, target: target.into(), key: key.into(), error: error.into(), rescued: false } } pub fn new_list(target: &str, prefix: &str, error: &str) -> Self { Self { ts: Utc::now(), op: BucketOp::List, target: target.into(), key: prefix.into(), error: error.into(), rescued: false } } } #[derive(Clone)] pub struct ErrorJournal { log: Arc, ring: Arc>>, } #[derive(Debug, Clone, Serialize)] pub struct HealthReport { pub period_minutes: i64, pub total_errors: usize, pub per_bucket: std::collections::HashMap, pub unhealthy_buckets: Vec, } impl ErrorJournal { pub fn new(store: Arc) -> Self { // Keep flush threshold lowish — operators checking /storage/errors // after a recent incident want to see fresh rows even without an // explicit flush. let log = Arc::new( AppendLog::new(store, JOURNAL_PREFIX).with_flush_threshold(8), ); Self { log, ring: Arc::new(RwLock::new(VecDeque::with_capacity(RING_CAPACITY))), } } /// Hydrate the ring buffer from existing batch files. Call once at /// startup. Tolerates malformed lines (skipped with a warning) and /// missing files (returns 0). pub async fn load_recent(&self) -> Result { let lines = self.log.read_all().await.unwrap_or_default(); let mut ring = self.ring.write().await; for line in lines { match serde_json::from_slice::(&line) { Ok(ev) => { if ring.len() >= RING_CAPACITY { ring.pop_front(); } ring.push_back(ev); } Err(e) => tracing::warn!("error journal: skip malformed line ({e})"), } } Ok(ring.len()) } /// Append an event. In-memory ring updated immediately; persistence /// happens in batches via the underlying AppendLog. pub async fn append(&self, event: BucketErrorEvent) { { let mut ring = self.ring.write().await; if ring.len() >= RING_CAPACITY { ring.pop_front(); } ring.push_back(event.clone()); } match serde_json::to_vec(&event) { Ok(line) => { if let Err(e) = self.log.append(line).await { tracing::error!("error journal persist failed: {e}"); } } Err(e) => tracing::error!("error journal serialize failed: {e}"), } } /// Mark the most recent matching in-memory event as rescued. /// Only updates the ring buffer — the JSONL line already persisted /// records the failure fact; rescue status travels in response headers. pub async fn mark_rescued_last(&self, target: &str, key: &str) { let mut ring = self.ring.write().await; for ev in ring.iter_mut().rev() { if matches!(ev.op, BucketOp::Read) && ev.target == target && ev.key == key && !ev.rescued { ev.rescued = true; break; } } } pub async fn recent(&self, limit: usize) -> Vec { let ring = self.ring.read().await; let start = ring.len().saturating_sub(limit); ring.iter().skip(start).cloned().collect() } pub async fn filter( &self, bucket: Option<&str>, since: Option>, limit: usize, ) -> Vec { let ring = self.ring.read().await; ring.iter() .rev() .filter(|ev| bucket.map_or(true, |b| ev.target == b)) .filter(|ev| since.map_or(true, |s| ev.ts >= s)) .take(limit) .cloned() .collect::>() .into_iter() .rev() .collect() } /// Summarize errors in the last N minutes. pub async fn health(&self, period_minutes: i64) -> HealthReport { use std::collections::HashMap; let cutoff = Utc::now() - chrono::Duration::minutes(period_minutes); let ring = self.ring.read().await; let recent: Vec<_> = ring.iter().filter(|ev| ev.ts >= cutoff).collect(); let mut per_bucket: HashMap = HashMap::new(); for ev in &recent { *per_bucket.entry(ev.target.clone()).or_insert(0) += 1; } let unhealthy_buckets: Vec = per_bucket .iter() .filter(|(_, c)| **c >= 3) .map(|(k, _)| k.clone()) .collect(); HealthReport { period_minutes, total_errors: recent.len(), per_bucket, unhealthy_buckets, } } /// Force an immediate flush of buffered events to object storage. pub async fn flush(&self) -> Result<(), String> { self.log.flush().await } /// Consolidate all batch files into one. Operator cleanup. pub async fn compact(&self) -> Result { self.log.compact().await } /// How many JSONL batch files currently exist. pub async fn file_count(&self) -> Result { self.log.file_count().await } }