root 1565f536eb Fix: job tracker field name mismatch — the overnight killer
ROOT CAUSE: Python scripts polled status.get("processed", 0) but the
Rust Job struct serialized as "embedded_chunks". Scripts always saw 0,
looped forever printing "unknown: 0/50000" for 8+ hours.

Fix (both sides):
- Rust: added "processed" alias field + "total" field to Job struct,
  kept in sync on every update_progress() and complete() call
- Python: fixed autonomous_agent.py and overnight_proof.sh to read
  "embedded_chunks" as primary key

The actual embedding pipeline was working the whole time — 673K real
chunks embedded overnight. Only the monitoring was blind.

One-word bug, 8 hours of zombie output. This is why you test the
monitoring, not just the pipeline.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-17 10:41:32 -05:00

122 lines
3.7 KiB
Rust

//! Background job system for long-running embedding tasks.
//! POST /vectors/index returns a job_id immediately.
//! GET /vectors/jobs/{id} returns progress.
//! Embedding runs in background via tokio::spawn.
use serde::Serialize;
use std::collections::HashMap;
use std::sync::Arc;
use tokio::sync::RwLock;
/// Lifecycle state of a background job.
///
/// Serialized as lowercase strings (`"running"`, `"completed"`, `"failed"`)
/// so polling scripts can compare against plain string literals.
#[derive(Debug, Clone, Serialize)]
#[serde(rename_all = "lowercase")]
pub enum JobStatus {
// Job was created and the background task is (or is about to be) working.
Running,
// Terminal: all chunks embedded; `storage_key` and `completed_at` are set.
Completed,
// Terminal: background task hit an error; `error` and `completed_at` are set.
Failed,
}
/// Snapshot of one background embedding job, as returned by
/// GET /vectors/jobs/{id}.
///
/// INVARIANT: `processed_alias` must always equal `embedded_chunks`, and
/// `total` must always equal `total_chunks`. The duplicate fields exist only
/// so the serialized JSON carries both the canonical names
/// ("embedded_chunks"/"total_chunks") and the names the monitoring scripts
/// historically polled ("processed"/"total") — see the commit message: the
/// name mismatch left monitoring reading 0 forever. Any code that mutates
/// `embedded_chunks` must mutate `processed_alias` in the same place.
#[derive(Debug, Clone, Serialize)]
pub struct Job {
// Unique job identifier, e.g. "job-1713368492000".
pub id: String,
// Current lifecycle state; see `JobStatus`.
pub status: JobStatus,
// Name of the vector index this job is populating.
pub index_name: String,
// Total number of chunks this job will embed; fixed at creation.
pub total_chunks: usize,
/// How many chunks have been embedded so far.
/// Also serialized as "processed" for backward-compat with monitoring scripts.
pub embedded_chunks: usize,
// Mirror of `embedded_chunks`, emitted under the JSON key "processed".
#[serde(rename = "processed")]
pub processed_alias: usize,
// Mirror of `total_chunks`, emitted under the JSON key "total".
pub total: usize,
// Completion percentage in [0.0, 100.0]; 0.0 when `total_chunks` is 0.
pub progress_pct: f32,
// Where the finished index was stored; Some only after `complete()`.
pub storage_key: Option<String>,
// Failure message; Some only after `fail()`.
pub error: Option<String>,
// RFC 3339 timestamp of job creation.
pub started_at: String,
// RFC 3339 timestamp of completion or failure; None while running.
pub completed_at: Option<String>,
// Most recent throughput sample, as reported to `update_progress()`.
pub chunks_per_sec: f32,
}
/// Shared progress tracker that background tasks update.
///
/// Cloning is cheap: clones share the same underlying map via `Arc`, so the
/// HTTP handlers and the spawned embedding task all observe the same state.
/// `tokio::sync::RwLock` (not `std`) so guards may be held across `.await`.
#[derive(Clone)]
pub struct JobTracker {
// job id -> job snapshot; single writer at a time, many concurrent readers.
jobs: Arc<RwLock<HashMap<String, Job>>>,
}
impl JobTracker {
    /// Create an empty tracker.
    pub fn new() -> Self {
        Self {
            jobs: Arc::new(RwLock::new(HashMap::new())),
        }
    }

    /// Register a new job. Returns the job ID.
    ///
    /// The ID is derived from the current wall-clock millisecond. Two jobs
    /// created within the same millisecond would otherwise get the same ID
    /// and the second insert would silently overwrite the first job's record,
    /// so we hold the write lock across both the uniqueness check and the
    /// insert, appending a numeric suffix ("-1", "-2", ...) on collision.
    pub async fn create(&self, index_name: &str, total_chunks: usize) -> String {
        // Take the lock first so check-then-insert is atomic.
        let mut jobs = self.jobs.write().await;
        let base = format!("job-{}", chrono::Utc::now().timestamp_millis());
        let mut id = base.clone();
        let mut suffix = 1u32;
        while jobs.contains_key(&id) {
            id = format!("{}-{}", base, suffix);
            suffix += 1;
        }
        let job = Job {
            id: id.clone(),
            status: JobStatus::Running,
            index_name: index_name.to_string(),
            total_chunks,
            embedded_chunks: 0,
            processed_alias: 0, // invariant: mirrors embedded_chunks
            total: total_chunks, // invariant: mirrors total_chunks
            progress_pct: 0.0,
            storage_key: None,
            error: None,
            started_at: chrono::Utc::now().to_rfc3339(),
            completed_at: None,
            chunks_per_sec: 0.0,
        };
        jobs.insert(id.clone(), job);
        id
    }

    /// Update progress for a running job.
    ///
    /// `embedded` is the cumulative chunk count, `rate` the current
    /// throughput in chunks/sec. Unknown IDs are ignored silently.
    pub async fn update_progress(&self, id: &str, embedded: usize, rate: f32) {
        let mut jobs = self.jobs.write().await;
        if let Some(job) = jobs.get_mut(id) {
            job.embedded_chunks = embedded;
            job.processed_alias = embedded; // keep "processed" alias in sync
            job.progress_pct = if job.total_chunks > 0 {
                (embedded as f32 / job.total_chunks as f32) * 100.0
            } else {
                // Empty job: avoid 0/0 -> NaN in the serialized percentage.
                0.0
            };
            job.chunks_per_sec = rate;
        }
    }

    /// Mark job as completed, recording where the index was stored.
    ///
    /// Forces the counters to 100% so monitoring never sees a completed job
    /// with a stale partial count. Unknown IDs are ignored silently.
    pub async fn complete(&self, id: &str, storage_key: String) {
        let mut jobs = self.jobs.write().await;
        if let Some(job) = jobs.get_mut(id) {
            job.status = JobStatus::Completed;
            job.embedded_chunks = job.total_chunks;
            job.processed_alias = job.total_chunks; // keep alias in sync
            job.progress_pct = 100.0;
            job.storage_key = Some(storage_key);
            job.completed_at = Some(chrono::Utc::now().to_rfc3339());
        }
    }

    /// Mark job as failed with an error message.
    ///
    /// Progress counters are left at their last reported values so the
    /// failure point is visible. Unknown IDs are ignored silently.
    pub async fn fail(&self, id: &str, error: String) {
        let mut jobs = self.jobs.write().await;
        if let Some(job) = jobs.get_mut(id) {
            job.status = JobStatus::Failed;
            job.error = Some(error);
            job.completed_at = Some(chrono::Utc::now().to_rfc3339());
        }
    }

    /// Get a snapshot of one job, or `None` if the ID is unknown.
    pub async fn get(&self, id: &str) -> Option<Job> {
        self.jobs.read().await.get(id).cloned()
    }

    /// Snapshot every tracked job (unordered).
    pub async fn list(&self) -> Vec<Job> {
        self.jobs.read().await.values().cloned().collect()
    }
}

impl Default for JobTracker {
    /// Equivalent to `JobTracker::new()` (clippy: `new_without_default`).
    fn default() -> Self {
        Self::new()
    }
}