ROOT CAUSE: Python scripts polled status.get("processed", 0) but the
Rust Job struct serialized as "embedded_chunks". Scripts always saw 0,
looped forever printing "unknown: 0/50000" for 8+ hours.
Fix (both sides):
- Rust: added "processed" alias field + "total" field to Job struct,
kept in sync on every update_progress() and complete() call
- Python: fixed autonomous_agent.py and overnight_proof.sh to read
"embedded_chunks" as primary key
The actual embedding pipeline was working the whole time — 673K real
chunks embedded overnight. Only the monitoring was blind.
One-word bug, 8 hours of zombie output. This is why you test the
monitoring, not just the pipeline.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
122 lines
3.7 KiB
Rust
122 lines
3.7 KiB
Rust
//! Background job system for long-running embedding tasks.
//!
//! `POST /vectors/index` returns a job_id immediately.
//! `GET /vectors/jobs/{id}` returns progress.
//! Embedding runs in background via `tokio::spawn`.
|
|
|
|
use serde::Serialize;
|
|
use std::collections::HashMap;
|
|
use std::sync::Arc;
|
|
use tokio::sync::RwLock;
|
|
|
|
#[derive(Debug, Clone, Serialize)]
|
|
#[serde(rename_all = "lowercase")]
|
|
pub enum JobStatus {
|
|
Running,
|
|
Completed,
|
|
Failed,
|
|
}
|
|
|
|
#[derive(Debug, Clone, Serialize)]
|
|
pub struct Job {
|
|
pub id: String,
|
|
pub status: JobStatus,
|
|
pub index_name: String,
|
|
pub total_chunks: usize,
|
|
/// How many chunks have been embedded so far.
|
|
/// Also serialized as "processed" for backward-compat with monitoring scripts.
|
|
pub embedded_chunks: usize,
|
|
#[serde(rename = "processed")]
|
|
pub processed_alias: usize,
|
|
pub total: usize,
|
|
pub progress_pct: f32,
|
|
pub storage_key: Option<String>,
|
|
pub error: Option<String>,
|
|
pub started_at: String,
|
|
pub completed_at: Option<String>,
|
|
pub chunks_per_sec: f32,
|
|
}
|
|
|
|
/// Shared progress tracker that background tasks update.
|
|
#[derive(Clone)]
|
|
pub struct JobTracker {
|
|
jobs: Arc<RwLock<HashMap<String, Job>>>,
|
|
}
|
|
|
|
impl JobTracker {
|
|
pub fn new() -> Self {
|
|
Self {
|
|
jobs: Arc::new(RwLock::new(HashMap::new())),
|
|
}
|
|
}
|
|
|
|
/// Register a new job. Returns the job ID.
|
|
pub async fn create(&self, index_name: &str, total_chunks: usize) -> String {
|
|
let id = format!("job-{}", chrono::Utc::now().timestamp_millis());
|
|
let job = Job {
|
|
id: id.clone(),
|
|
status: JobStatus::Running,
|
|
index_name: index_name.to_string(),
|
|
total_chunks,
|
|
embedded_chunks: 0,
|
|
processed_alias: 0,
|
|
total: total_chunks,
|
|
progress_pct: 0.0,
|
|
storage_key: None,
|
|
error: None,
|
|
started_at: chrono::Utc::now().to_rfc3339(),
|
|
completed_at: None,
|
|
chunks_per_sec: 0.0,
|
|
};
|
|
self.jobs.write().await.insert(id.clone(), job);
|
|
id
|
|
}
|
|
|
|
/// Update progress.
|
|
pub async fn update_progress(&self, id: &str, embedded: usize, rate: f32) {
|
|
let mut jobs = self.jobs.write().await;
|
|
if let Some(job) = jobs.get_mut(id) {
|
|
job.embedded_chunks = embedded;
|
|
job.processed_alias = embedded; // keep alias in sync
|
|
job.progress_pct = if job.total_chunks > 0 {
|
|
(embedded as f32 / job.total_chunks as f32) * 100.0
|
|
} else {
|
|
0.0
|
|
};
|
|
job.chunks_per_sec = rate;
|
|
}
|
|
}
|
|
|
|
/// Mark job as completed.
|
|
pub async fn complete(&self, id: &str, storage_key: String) {
|
|
let mut jobs = self.jobs.write().await;
|
|
if let Some(job) = jobs.get_mut(id) {
|
|
job.status = JobStatus::Completed;
|
|
job.embedded_chunks = job.total_chunks;
|
|
job.processed_alias = job.total_chunks;
|
|
job.progress_pct = 100.0;
|
|
job.storage_key = Some(storage_key);
|
|
job.completed_at = Some(chrono::Utc::now().to_rfc3339());
|
|
}
|
|
}
|
|
|
|
/// Mark job as failed.
|
|
pub async fn fail(&self, id: &str, error: String) {
|
|
let mut jobs = self.jobs.write().await;
|
|
if let Some(job) = jobs.get_mut(id) {
|
|
job.status = JobStatus::Failed;
|
|
job.error = Some(error);
|
|
job.completed_at = Some(chrono::Utc::now().to_rfc3339());
|
|
}
|
|
}
|
|
|
|
/// Get job status.
|
|
pub async fn get(&self, id: &str) -> Option<Job> {
|
|
self.jobs.read().await.get(id).cloned()
|
|
}
|
|
|
|
/// List all jobs.
|
|
pub async fn list(&self) -> Vec<Job> {
|
|
self.jobs.read().await.values().cloned().collect()
|
|
}
|
|
}
|