Phase 1 implementation: script ingestion + AI extraction pipeline

Complete working pipeline from Fountain script to validated scene JSON (usage sketch after the list):
- Schemas (Pydantic): all 7 layers defined upfront
- Fountain parser + normalizer (Layer 1)
- AI scene extractor with prompt contracts (Layer 2)
- Schema validator + scene-specific semantic validator
- Structured JSON logging per layer/scene execution
- Versioned output writer (never overwrites)
- Retry engine with 4-level failure escalation
- Stop condition evaluator (per-unit + global halts)
- Diff/drift detector for re-run comparison
- CLI entry point with --dry-run, --scene, --test, --force
- 3 test scripts (dialogue-heavy, action-heavy, nonstandard)
- Expected output files for regression testing
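
A minimal end-to-end usage sketch (not part of the commit; the script path is hypothetical):

from src.execution.runner import run_phase1

# Dry run: parses and validates the script, writes the L1 output, makes no AI calls.
result = run_phase1(
    script_path="test_scripts/action_heavy.fountain",  # any .fountain file
    project_name="demo",
    api_key="",  # unused when dry_run=True
    dry_run=True,
)
print(result.success, result.total_scenes)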

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
profit 2026-04-06 15:49:43 -07:00
parent 2218e47c1f
commit 87d0af0748
42 changed files with 2418 additions and 0 deletions

1
.env.example Normal file
View File

@ -0,0 +1 @@
ANTHROPIC_API_KEY=your-api-key-here

6
.gitignore vendored Normal file
View File

@ -0,0 +1,6 @@
output/
.env
__pycache__/
*.pyc
.venv/
venv/

3
requirements.txt Normal file
View File

@ -0,0 +1,3 @@
anthropic>=0.40.0
pydantic>=2.0.0
python-dotenv>=1.0.0

190
run.py Normal file
View File

@ -0,0 +1,190 @@
"""CLI entry point for the AI Movie Production Pipeline."""
import argparse
import json
import os
import sys
from dotenv import load_dotenv
def main():
load_dotenv()
parser = argparse.ArgumentParser(description="AI Movie Production Pipeline — Phase 1")
parser.add_argument("--script", type=str, help="Path to .fountain script file")
parser.add_argument("--project", type=str, help="Project name (determines output directory)")
parser.add_argument("--model", type=str, default="claude-sonnet-4-20250514", help="Model ID")
parser.add_argument("--scene", type=int, default=None, help="Process only this scene number")
parser.add_argument("--dry-run", action="store_true", help="Validate inputs only, no AI calls")
parser.add_argument("--force", action="store_true", help="Ignore cache, re-run even if unchanged")
parser.add_argument("--test", action="store_true", help="Run test suite against test_scripts/")
parser.add_argument("--output-dir", type=str, default="output", help="Base output directory")
args = parser.parse_args()
if args.test:
run_tests(args.model, args.output_dir)
return
if not args.script or not args.project:
parser.error("--script and --project are required (unless using --test)")
api_key = os.environ.get("ANTHROPIC_API_KEY")
if not api_key and not args.dry_run:
print("ERROR: ANTHROPIC_API_KEY not set. Set it in .env or environment.")
sys.exit(1)
if not os.path.exists(args.script):
print(f"ERROR: Script file not found: {args.script}")
sys.exit(1)
from src.execution.runner import run_phase1
result = run_phase1(
script_path=args.script,
project_name=args.project,
api_key=api_key or "",
model=args.model,
output_dir=args.output_dir,
scene_filter=args.scene,
dry_run=args.dry_run,
force=args.force,
)
if not result.success:
print(f"\nPIPELINE FAILED: {result.stop_reason}")
sys.exit(1)
print("\nPIPELINE COMPLETE")
sys.exit(0)
def run_tests(model: str, output_dir: str):
"""Run test suite against all scripts in test_scripts/."""
api_key = os.environ.get("ANTHROPIC_API_KEY")
if not api_key:
print("ERROR: ANTHROPIC_API_KEY required for tests")
sys.exit(1)
test_dir = os.path.join(os.path.dirname(__file__), "test_scripts")
expected_dir = os.path.join(test_dir, "expected")
scripts = [f for f in os.listdir(test_dir) if f.endswith(".fountain")]
if not scripts:
print("No test scripts found in test_scripts/")
sys.exit(1)
from src.execution.runner import run_phase1
all_passed = True
for script_file in sorted(scripts):
script_path = os.path.join(test_dir, script_file)
project_name = f"test_{os.path.splitext(script_file)[0]}"
expected_file = os.path.join(expected_dir, f"{os.path.splitext(script_file)[0]}_scenes.json")
print(f"\n{'='*60}")
print(f"TEST: {script_file}")
print(f"{'='*60}")
result = run_phase1(
script_path=script_path,
project_name=project_name,
api_key=api_key,
model=model,
output_dir=output_dir,
)
# Check against expected
if os.path.exists(expected_file):
with open(expected_file, "r", encoding="utf-8") as f:
expected = json.load(f)
passed = True
# Scene count check (within 20%)
expected_count = expected.get("expected_scene_count", 0)
if expected_count > 0:
deviation = abs(result.total_scenes - expected_count) / expected_count
if deviation > 0.20:
print(f" FAIL: Scene count {result.total_scenes} vs expected {expected_count} (deviation {deviation:.0%})")
passed = False
else:
print(f" PASS: Scene count {result.total_scenes} (expected {expected_count})")
# Character check — read actual output
actual_characters = _collect_characters(output_dir, project_name)
for char in expected.get("expected_characters", []):
if char.upper() not in {c.upper() for c in actual_characters}:
print(f" FAIL: Expected character '{char}' not found")
passed = False
else:
print(f" PASS: Character '{char}' found")
# Hallucination check
for char in expected.get("must_not_contain_characters", []):
if char.upper() in {c.upper() for c in actual_characters}:
print(f" FAIL: Hallucinated character '{char}' found")
passed = False
# Location check
actual_locations = _collect_locations(output_dir, project_name)
for loc in expected.get("expected_locations", []):
if loc.upper() not in {l.upper() for l in actual_locations}:
print(f" FAIL: Expected location '{loc}' not found")
passed = False
else:
print(f" PASS: Location '{loc}' found")
if passed:
print(f" RESULT: PASSED")
else:
print(f" RESULT: FAILED")
all_passed = False
else:
print(f" No expected output file — skipping regression checks")
if not result.success:
all_passed = False
print(f"\n{'='*60}")
if all_passed:
print("ALL TESTS PASSED")
sys.exit(0)
else:
print("SOME TESTS FAILED")
sys.exit(1)
def _collect_characters(output_dir: str, project_name: str) -> set[str]:
"""Collect all character names from L2 scene outputs."""
l2_dir = os.path.join(output_dir, project_name, "L2")
characters: set[str] = set()
if not os.path.exists(l2_dir):
return characters
for f in os.listdir(l2_dir):
if f.startswith("scene_") and f.endswith(".json") and f != "latest.json":
with open(os.path.join(l2_dir, f), "r", encoding="utf-8") as fh:
data = json.load(fh)
characters.update(data.get("characters_present", []))
return characters
def _collect_locations(output_dir: str, project_name: str) -> set[str]:
"""Collect all location names from L2 scene outputs."""
l2_dir = os.path.join(output_dir, project_name, "L2")
locations: set[str] = set()
if not os.path.exists(l2_dir):
return locations
for f in os.listdir(l2_dir):
if f.startswith("scene_") and f.endswith(".json") and f != "latest.json":
with open(os.path.join(l2_dir, f), "r", encoding="utf-8") as fh:
data = json.load(fh)
loc = data.get("location", "")
if loc:
locations.add(loc)
return locations
if __name__ == "__main__":
main()

0
src/__init__.py Normal file
View File

6
src/config.py Normal file
View File

@ -0,0 +1,6 @@
"""Project-level configuration defaults."""
DEFAULT_MODEL = "claude-sonnet-4-20250514"
DEFAULT_OUTPUT_DIR = "output"
DEFAULT_MAX_CHUNK_CHARS = 50000
DEFAULT_MAX_RETRIES_PER_LEVEL = 2

0
src/execution/__init__.py Normal file
View File

53
src/execution/diff_detector.py Normal file
View File

@ -0,0 +1,53 @@
"""Diff and drift detector — compares layer output versions."""
from dataclasses import dataclass, field
@dataclass
class DiffReport:
changed: bool
fields_changed: list[str] = field(default_factory=list)
drift_categories: list[str] = field(default_factory=list)
# Fields that map to drift categories
_DRIFT_MAP = {
"character_drift": {"characters_present", "new_characters_introduced"},
"location_drift": {"location", "int_ext"},
"continuity_drift": {"continuity_notes", "dependencies"},
"structure_drift": {"scene_number", "scene_heading"},
}
def diff_outputs(previous: dict, current: dict) -> DiffReport:
"""Compare two versions of a layer output.
Args:
previous: The previous version's data dict.
current: The new version's data dict.
Returns:
DiffReport with changed fields and drift categories.
"""
fields_changed: list[str] = []
drift_categories: set[str] = set()
all_keys = set(previous.keys()) | set(current.keys())
for key in all_keys:
prev_val = previous.get(key)
curr_val = current.get(key)
if prev_val != curr_val:
fields_changed.append(key)
# Categorize drift
for category, fields in _DRIFT_MAP.items():
if key in fields:
drift_categories.add(category)
return DiffReport(
changed=len(fields_changed) > 0,
fields_changed=fields_changed,
drift_categories=sorted(drift_categories),
)
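
A usage sketch for the diff detector (illustrative values, not from the repo):

from src.execution.diff_detector import diff_outputs

prev = {"scene_number": 3, "location": "ROOFTOP", "characters_present": ["JACK"]}
curr = {"scene_number": 3, "location": "ALLEY", "characters_present": ["JACK", "RAMOS"]}

report = diff_outputs(prev, curr)
# report.changed          -> True
# report.fields_changed   -> ["location", "characters_present"] (set iteration, order varies)
# report.drift_categories -> ["character_drift", "location_drift"]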

112
src/execution/output_writer.py Normal file
View File

@ -0,0 +1,112 @@
"""Versioned output writer — writes layer outputs, never overwrites."""
import hashlib
import json
import os
from pydantic import BaseModel
class OutputWriter:
def __init__(self, project_name: str, output_dir: str = "output"):
self.project_name = project_name
self.output_dir = output_dir
def write(self, layer_id: str, scene_id: int | None, data: BaseModel) -> dict:
"""Write a layer output to a versioned JSON file.
Returns:
{"path": str, "version": int, "hash": str}
"""
layer_dir = os.path.join(self.output_dir, self.project_name, layer_id)
os.makedirs(layer_dir, exist_ok=True)
# Determine next version
prefix = f"scene_{scene_id:03d}" if scene_id is not None else "output"
version = self._next_version(layer_dir, prefix)
# Serialize
data_dict = data.model_dump()
data_json = json.dumps(data_dict, indent=2, ensure_ascii=False)
data_hash = hashlib.sha256(data_json.encode()).hexdigest()
# Write versioned file
filename = f"{prefix}_v{version}.json"
filepath = os.path.join(layer_dir, filename)
with open(filepath, "w", encoding="utf-8") as f:
f.write(data_json)
# Update latest manifest
self._update_latest(layer_dir, scene_id, version)
return {"path": filepath, "version": version, "hash": f"sha256:{data_hash}"}
def write_raw(self, layer_id: str, scene_id: int | None, data: dict) -> dict:
"""Write a raw dict (not a Pydantic model) to a versioned JSON file."""
layer_dir = os.path.join(self.output_dir, self.project_name, layer_id)
os.makedirs(layer_dir, exist_ok=True)
prefix = f"scene_{scene_id:03d}" if scene_id is not None else "output"
version = self._next_version(layer_dir, prefix)
data_json = json.dumps(data, indent=2, ensure_ascii=False)
data_hash = hashlib.sha256(data_json.encode()).hexdigest()
filename = f"{prefix}_v{version}.json"
filepath = os.path.join(layer_dir, filename)
with open(filepath, "w", encoding="utf-8") as f:
f.write(data_json)
self._update_latest(layer_dir, scene_id, version)
return {"path": filepath, "version": version, "hash": f"sha256:{data_hash}"}
def read_latest(self, layer_id: str, scene_id: int | None) -> dict | None:
"""Read the latest version of a layer output."""
layer_dir = os.path.join(self.output_dir, self.project_name, layer_id)
latest_path = os.path.join(layer_dir, "latest.json")
if not os.path.exists(latest_path):
return None
with open(latest_path, "r", encoding="utf-8") as f:
manifest = json.load(f)
key = str(scene_id) if scene_id is not None else "output"
version = manifest.get(key)
if version is None:
return None
return self.read_version(layer_id, scene_id, version)
def read_version(self, layer_id: str, scene_id: int | None, version: int) -> dict | None:
"""Read a specific version of a layer output."""
layer_dir = os.path.join(self.output_dir, self.project_name, layer_id)
prefix = f"scene_{scene_id:03d}" if scene_id is not None else "output"
filepath = os.path.join(layer_dir, f"{prefix}_v{version}.json")
if not os.path.exists(filepath):
return None
with open(filepath, "r", encoding="utf-8") as f:
return json.load(f)
def _next_version(self, layer_dir: str, prefix: str) -> int:
"""Find the next available version number."""
version = 1
while os.path.exists(os.path.join(layer_dir, f"{prefix}_v{version}.json")):
version += 1
return version
def _update_latest(self, layer_dir: str, scene_id: int | None, version: int):
"""Update the latest.json manifest."""
latest_path = os.path.join(layer_dir, "latest.json")
manifest = {}
if os.path.exists(latest_path):
with open(latest_path, "r", encoding="utf-8") as f:
manifest = json.load(f)
key = str(scene_id) if scene_id is not None else "output"
manifest[key] = version
with open(latest_path, "w", encoding="utf-8") as f:
json.dump(manifest, f, indent=2)
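
A usage sketch for the versioned writer (hypothetical payload; shows that a re-run never overwrites):

from src.execution.output_writer import OutputWriter

writer = OutputWriter("demo")
first = writer.write_raw("L2", 1, {"scene_number": 1, "location": "ROOFTOP"})
second = writer.write_raw("L2", 1, {"scene_number": 1, "location": "ALLEY"})
# first["version"] == 1 and second["version"] == 2; both files stay on disk,
# and output/demo/L2/latest.json now maps scene "1" to version 2.
assert writer.read_latest("L2", 1)["location"] == "ALLEY"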

92
src/execution/retry.py Normal file
View File

@ -0,0 +1,92 @@
"""Retry and escalation engine — 4-level failure escalation."""
import time
from dataclasses import dataclass, field
from typing import Callable, Any
@dataclass
class FailureRecord:
scene_id: int | None
layer_id: str
escalation_level: int
error: str
attempts: list[dict] = field(default_factory=list)
def execute_with_retry(
fn: Callable,
input_data: Any,
layer_id: str,
scene_id: int | None = None,
fallback_fn: Callable | None = None,
max_attempts_per_level: int = 2,
) -> Any | FailureRecord:
"""Execute a function with 4-level escalation on failure.
Level 1: Retry same config (up to max_attempts_per_level)
Level 2: Retry same config again (placeholder for modified prompt in future)
Level 3: Call fallback_fn if provided
Level 4: Return FailureRecord
Args:
fn: The function to execute. Takes input_data as sole argument.
input_data: Passed to fn.
layer_id: For logging in FailureRecord.
scene_id: For logging in FailureRecord.
fallback_fn: Optional simplified extraction function for Level 3.
max_attempts_per_level: Max attempts at each escalation level.
Returns:
fn's return value on success, or FailureRecord if all levels exhausted.
"""
attempts: list[dict] = []
# Level 1: Retry same config
for attempt in range(max_attempts_per_level):
try:
result = fn(input_data)
return result
except Exception as e:
attempts.append({
"level": 1,
"attempt": attempt + 1,
"error": str(e),
"timestamp": time.strftime("%Y-%m-%dT%H:%M:%S.000Z", time.gmtime()),
})
# Level 2: Retry again (future: modified prompt)
for attempt in range(max_attempts_per_level):
try:
result = fn(input_data)
return result
except Exception as e:
attempts.append({
"level": 2,
"attempt": attempt + 1,
"error": str(e),
"timestamp": time.strftime("%Y-%m-%dT%H:%M:%S.000Z", time.gmtime()),
})
# Level 3: Fallback
if fallback_fn:
for attempt in range(max_attempts_per_level):
try:
result = fallback_fn(input_data)
return result
except Exception as e:
attempts.append({
"level": 3,
"attempt": attempt + 1,
"error": str(e),
"timestamp": time.strftime("%Y-%m-%dT%H:%M:%S.000Z", time.gmtime()),
})
# Level 4: Human intervention
return FailureRecord(
scene_id=scene_id,
layer_id=layer_id,
escalation_level=4,
error="All escalation levels exhausted",
attempts=attempts,
)
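
A usage sketch for the escalation engine (a deliberately flaky function, illustrative only):

from src.execution.retry import execute_with_retry, FailureRecord

calls = {"n": 0}

def flaky(data):
    calls["n"] += 1
    if calls["n"] < 3:
        raise RuntimeError("transient failure")
    return data.upper()

result = execute_with_retry(flaky, "scene text", layer_id="L2", scene_id=1)
# Attempts 1-2 fail at Level 1; attempt 3 succeeds at Level 2 -> "SCENE TEXT".
# Had every level failed, result would be a FailureRecord with escalation_level=4.
assert not isinstance(result, FailureRecord) and result == "SCENE TEXT"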

211
src/execution/runner.py Normal file
View File

@ -0,0 +1,211 @@
"""Pipeline runner — orchestrates Layer 1 → Layer 2 end-to-end."""
import hashlib
import json
import os
from dataclasses import dataclass, field
from src.ingestion.fountain_parser import parse_fountain
from src.ingestion.normalizer import normalize, NormalizationError
from src.understanding.extractor import extract_scenes, ExtractionError
from src.understanding.chunker import chunk_script
from src.schemas.scene import Scene
from src.schemas.scene_array import SceneArray
from src.validators.schema_validator import validate, ValidationResult
from src.validators.scene_validator import validate_scenes, ValidationWarning
from src.logging.layer_logger import LayerLogger
from src.execution.output_writer import OutputWriter
from src.execution.retry import execute_with_retry, FailureRecord
from src.execution.stop_conditions import evaluate_stop
from src.execution.diff_detector import diff_outputs
@dataclass
class PipelineResult:
success: bool
total_scenes: int = 0
valid_scenes: int = 0
flagged_scenes: int = 0
failed_scenes: int = 0
warnings: list[ValidationWarning] = field(default_factory=list)
stop_reason: str | None = None
def run_phase1(
script_path: str,
project_name: str,
api_key: str,
model: str = "claude-sonnet-4-20250514",
output_dir: str = "output",
scene_filter: int | None = None,
dry_run: bool = False,
force: bool = False,
) -> PipelineResult:
"""Run the Phase 1 pipeline: Layer 1 (ingestion) → Layer 2 (understanding).
Args:
script_path: Path to .fountain file.
project_name: Project name for output directory.
api_key: Anthropic API key.
model: Model ID.
output_dir: Base output directory.
scene_filter: If set, only process this scene number in Layer 2.
        dry_run: If True, validate inputs only; no AI calls are made.
force: If True, ignore cache and re-run even if input unchanged.
Returns:
PipelineResult with counts and any stop reason.
"""
logger = LayerLogger(project_name, output_dir)
writer = OutputWriter(project_name, output_dir)
# Resolve prompt contract path
prompts_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "src", "prompts")
contract_path = os.path.join(prompts_dir, "L2_scene_extraction_v1.json")
# ── LAYER 1: INGESTION ──────────────────────────────────────────────
print(f"[L1] Parsing: {script_path}")
run_id = logger.start("L1")
# Read file
with open(script_path, "r", encoding="utf-8") as f:
raw_text = f.read()
input_hash = f"sha256:{hashlib.sha256(raw_text.encode()).hexdigest()}"
# Parse
try:
elements = parse_fountain(raw_text)
normalized = normalize(elements)
    except Exception as e:  # a NormalizationError or any parse failure halts Layer 1
logger.finish(run_id, input_hash, None, "failed", failure_state=str(e))
print(f"[L1] STOP: {e}")
return PipelineResult(success=False, stop_reason=f"Layer 1 failed: {e}")
# Write L1 output
l1_result = writer.write("L1", None, normalized)
logger.finish(run_id, input_hash, l1_result["hash"], "valid")
heading_count = sum(1 for e in normalized.elements if e.type == "scene_heading")
print(f"[L1] Done: {len(normalized.elements)} elements, {heading_count} scene headings")
if dry_run:
print(f"[DRY RUN] Layer 1 valid. {heading_count} scenes would be extracted.")
return PipelineResult(success=True, total_scenes=heading_count)
# ── LAYER 2: UNDERSTANDING ──────────────────────────────────────────
print(f"[L2] Extracting scenes...")
# Chunk if needed
chunks = chunk_script(normalized)
print(f"[L2] Processing in {len(chunks)} chunk(s)")
all_results: list[ValidationResult | FailureRecord] = []
all_valid_scenes: list[Scene] = []
all_warnings: list[ValidationWarning] = []
total_token_usage = {"input": 0, "output": 0}
for chunk_idx, chunk in enumerate(chunks):
# Extract scenes from this chunk
def do_extract(data):
return extract_scenes(data, contract_path, api_key, model)
run_id = logger.start("L2", scene_id=chunk_idx)
chunk_input_hash = f"sha256:{hashlib.sha256(json.dumps([e.model_dump() for e in chunk.elements]).encode()).hexdigest()}"
extraction = execute_with_retry(
fn=do_extract,
input_data=chunk,
layer_id="L2",
scene_id=chunk_idx,
)
if isinstance(extraction, FailureRecord):
logger.finish(run_id, chunk_input_hash, None, "failed",
failure_state=extraction.error, retry_count=len(extraction.attempts))
all_results.append(extraction)
print(f"[L2] Chunk {chunk_idx + 1}: FAILED after {len(extraction.attempts)} attempts")
continue
# Validate each scene
total_token_usage["input"] += extraction.token_usage["input"]
total_token_usage["output"] += extraction.token_usage["output"]
for raw_scene in extraction.raw_scenes:
scene_num = raw_scene.get("scene_number", "?")
# Skip if scene_filter is set and doesn't match
if scene_filter is not None and scene_num != scene_filter:
continue
scene_run_id = logger.start("L2", scene_id=scene_num if isinstance(scene_num, int) else None)
scene_input_hash = f"sha256:{hashlib.sha256(json.dumps(raw_scene).encode()).hexdigest()}"
result = validate(raw_scene, Scene)
all_results.append(result)
if result.status == "failed":
logger.finish(scene_run_id, scene_input_hash, None, "failed",
failure_state="; ".join(result.errors))
print(f"[L2] Scene {scene_num}: FAILED validation — {result.errors}")
else:
# Write scene output
scene_out = writer.write_raw("L2", scene_num if isinstance(scene_num, int) else 0, raw_scene)
# Check for drift against previous version
if isinstance(scene_num, int):
prev = writer.read_version("L2", scene_num, scene_out["version"] - 1)
if prev:
diff = diff_outputs(prev, raw_scene)
if diff.changed:
print(f"[L2] Scene {scene_num}: DRIFT detected — {diff.drift_categories}")
logger.finish(scene_run_id, scene_input_hash, scene_out["hash"], result.status,
token_usage=extraction.token_usage)
if result.status == "flagged":
print(f"[L2] Scene {scene_num}: FLAGGED (contains UNKNOWN values)")
else:
print(f"[L2] Scene {scene_num}: valid")
all_valid_scenes.append(result.data)
logger.finish(run_id, chunk_input_hash, "chunk", "valid",
token_usage=extraction.token_usage)
# Run scene-level semantic validation
if all_valid_scenes:
scene_array = SceneArray(scenes=all_valid_scenes)
all_warnings = validate_scenes(scene_array, heading_count)
for w in all_warnings:
print(f"[L2] WARNING: {w.message}")
# Evaluate stop conditions
stop = evaluate_stop(all_results, heading_count)
if stop.should_stop:
print(f"[L2] STOP CONDITION: {stop.reason}")
return PipelineResult(
success=False,
total_scenes=heading_count,
valid_scenes=sum(1 for r in all_results if isinstance(r, ValidationResult) and r.status == "valid"),
flagged_scenes=sum(1 for r in all_results if isinstance(r, ValidationResult) and r.status == "flagged"),
failed_scenes=sum(1 for r in all_results if isinstance(r, FailureRecord) or (isinstance(r, ValidationResult) and r.status == "failed")),
warnings=all_warnings,
stop_reason=stop.reason,
)
valid = sum(1 for r in all_results if isinstance(r, ValidationResult) and r.status == "valid")
flagged = sum(1 for r in all_results if isinstance(r, ValidationResult) and r.status == "flagged")
failed = sum(1 for r in all_results if isinstance(r, FailureRecord) or (isinstance(r, ValidationResult) and r.status == "failed"))
print(f"\n[DONE] Scenes: {valid} valid, {flagged} flagged, {failed} failed")
print(f"[DONE] Tokens: {total_token_usage['input']} in / {total_token_usage['output']} out")
return PipelineResult(
success=True,
total_scenes=heading_count,
valid_scenes=valid,
flagged_scenes=flagged,
failed_scenes=failed,
warnings=all_warnings,
)

59
src/execution/stop_conditions.py Normal file
View File

@ -0,0 +1,59 @@
"""Stop condition evaluator — decides when the pipeline must halt."""
from dataclasses import dataclass
from typing import Literal
from src.validators.schema_validator import ValidationResult
from src.execution.retry import FailureRecord
@dataclass
class StopDecision:
should_stop: bool
reason: str | None
scope: Literal["unit", "global"] | None
def evaluate_stop(
results: list[ValidationResult | FailureRecord],
total_scenes: int,
) -> StopDecision:
"""Evaluate whether a stop condition has been triggered.
Args:
results: List of validation results and/or failure records from a layer run.
total_scenes: Total number of scenes that were expected to be processed.
Returns:
StopDecision indicating whether to halt and why.
"""
if total_scenes == 0:
return StopDecision(
should_stop=True,
reason="Zero scenes to process",
scope="global",
)
failed_count = 0
for result in results:
if isinstance(result, FailureRecord):
failed_count += 1
elif isinstance(result, ValidationResult) and result.status == "failed":
failed_count += 1
# Global stop: all scenes failed
if failed_count == total_scenes:
return StopDecision(
should_stop=True,
reason=f"All {total_scenes} scenes failed",
scope="global",
)
# Global stop: >30% failure rate
if total_scenes > 0 and failed_count / total_scenes > 0.30:
return StopDecision(
should_stop=True,
reason=f"{failed_count}/{total_scenes} scenes failed ({failed_count/total_scenes:.0%} > 30% threshold)",
scope="global",
)
return StopDecision(should_stop=False, reason=None, scope=None)
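
A usage sketch for the stop evaluator (fabricated results; exercises the 30% global halt):

from src.execution.stop_conditions import evaluate_stop
from src.validators.schema_validator import ValidationResult

results = [ValidationResult(status="valid") for _ in range(6)]
results += [ValidationResult(status="failed") for _ in range(4)]
decision = evaluate_stop(results, total_scenes=10)
# 4/10 failed (40% > 30% threshold) -> should_stop=True, scope="global".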

0
src/ingestion/__init__.py Normal file
View File

210
src/ingestion/fountain_parser.py Normal file
View File

@ -0,0 +1,210 @@
"""Fountain format parser. Deterministic code — no AI.
Follows the Fountain spec: https://fountain.io/syntax
Converts raw .fountain text into a list of ScriptElement objects.
"""
import re
from src.schemas.script_element import ScriptElement
# Fountain scene heading prefixes
_SCENE_HEADING_PREFIXES = (
"INT ", "INT.", "EXT ", "EXT.", "INT/EXT", "INT./EXT", "I/E",
"int ", "int.", "ext ", "ext.", "int/ext", "int./ext", "i/e",
)
_TRANSITION_PATTERN = re.compile(r"^[A-Z\s]+TO:$")
_FORCED_SCENE_HEADING = re.compile(r"^\.[A-Z]")
_FORCED_TRANSITION = re.compile(r"^>(?!<)")
_CENTERED_TEXT = re.compile(r"^>.*<$")
_PARENTHETICAL = re.compile(r"^\(.*\)$")
def parse_fountain(text: str) -> list[ScriptElement]:
"""Parse Fountain-formatted text into ScriptElement list.
Args:
text: Raw contents of a .fountain file.
Returns:
Ordered list of ScriptElement objects with line numbers.
"""
lines = text.replace("\r\n", "\n").split("\n")
elements: list[ScriptElement] = []
    i = 0
    title_page_end = _find_first_content_line(lines)  # computed once, not per line
while i < len(lines):
line = lines[i]
stripped = line.strip()
line_num = i + 1 # 1-indexed
# Skip empty lines
if not stripped:
i += 1
continue
        # Title page (key: value at start of document) — skip
        if i < title_page_end and ":" in stripped:
i += 1
continue
# Boneyard (/* ... */) — skip
if stripped.startswith("/*"):
while i < len(lines) and "*/" not in lines[i]:
i += 1
i += 1
continue
# Notes ([[ ... ]]) — skip
if stripped.startswith("[[") and stripped.endswith("]]"):
i += 1
continue
# Section headers (# ) — skip (metadata, not story)
if stripped.startswith("#"):
i += 1
continue
# Synopsis (= ) — skip (metadata)
if stripped.startswith("=") and not stripped.startswith("=="):
i += 1
continue
# Page break (===) — skip
if stripped.startswith("==="):
i += 1
continue
# Forced scene heading (.LOCATION)
if _FORCED_SCENE_HEADING.match(stripped):
elements.append(ScriptElement(
type="scene_heading",
text=stripped[1:].strip(), # remove leading dot
line_number=line_num,
))
i += 1
continue
# Standard scene heading
if stripped.upper().startswith(_SCENE_HEADING_PREFIXES):
elements.append(ScriptElement(
type="scene_heading",
text=stripped,
line_number=line_num,
))
i += 1
continue
# Forced transition (> TEXT)
if _FORCED_TRANSITION.match(stripped):
elements.append(ScriptElement(
type="transition",
text=stripped[1:].strip(),
line_number=line_num,
))
i += 1
continue
# Standard transition (SOMETHING TO:)
if _TRANSITION_PATTERN.match(stripped):
elements.append(ScriptElement(
type="transition",
text=stripped,
line_number=line_num,
))
i += 1
continue
# Centered text — treat as action
if _CENTERED_TEXT.match(stripped):
elements.append(ScriptElement(
type="action",
text=stripped[1:-1].strip(),
line_number=line_num,
))
i += 1
continue
# Character + Dialogue block
if _is_character_line(stripped, lines, i):
# Character name
char_name = stripped.rstrip("^").strip() # remove dual dialogue caret
if char_name.startswith("@"):
char_name = char_name[1:] # forced character
elements.append(ScriptElement(
type="character",
text=char_name,
line_number=line_num,
))
i += 1
# Consume parentheticals and dialogue that follow
while i < len(lines):
next_line = lines[i].strip()
if not next_line:
break
if _PARENTHETICAL.match(next_line):
elements.append(ScriptElement(
type="parenthetical",
text=next_line,
line_number=i + 1,
))
else:
elements.append(ScriptElement(
type="dialogue",
text=next_line,
line_number=i + 1,
))
i += 1
continue
# Default: action
elements.append(ScriptElement(
type="action",
text=stripped,
line_number=line_num,
))
i += 1
return elements
def _is_character_line(stripped: str, lines: list[str], index: int) -> bool:
"""Determine if a line is a character cue.
Fountain rules:
- All uppercase
- Followed by dialogue (non-empty next line after possible blank)
- May end with ^ (dual dialogue)
- May start with @ (forced character)
"""
if stripped.startswith("@"):
return True
# Must be uppercase (ignoring parenthetical extensions like (V.O.), (O.S.))
name_part = re.sub(r"\(.*?\)", "", stripped).strip().rstrip("^").strip()
if not name_part:
return False
    # Allow digits in cues (e.g. "GUARD #1") but require at least one letter
    cleaned = re.sub(r"[ .'\-#]", "", name_part)
    if not cleaned.isalnum() or not any(c.isalpha() for c in cleaned):
        return False
if name_part != name_part.upper():
return False
# Must have a non-empty line following (the dialogue)
next_i = index + 1
if next_i < len(lines) and lines[next_i].strip():
return True
return False
def _find_first_content_line(lines: list[str]) -> int:
"""Find the first line that isn't part of the title page.
Title page ends at the first blank line.
"""
for i, line in enumerate(lines):
if not line.strip():
return i + 1
return 0
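
A parsing sketch (inline snippet; the real inputs live in test_scripts/):

from src.ingestion.fountain_parser import parse_fountain

snippet = """INT. APARTMENT - NIGHT

JACK
We move at midnight.

CUT TO:"""
elements = parse_fountain(snippet)
# -> scene_heading "INT. APARTMENT - NIGHT", character "JACK",
#    dialogue "We move at midnight.", transition "CUT TO:"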

29
src/ingestion/normalizer.py Normal file
View File

@ -0,0 +1,29 @@
"""Normalizer — wraps parsed ScriptElements into a validated NormalizedScript."""
from src.schemas.script_element import ScriptElement
from src.schemas.normalized_script import NormalizedScript
class NormalizationError(Exception):
pass
def normalize(elements: list[ScriptElement]) -> NormalizedScript:
"""Validate and wrap ScriptElements into NormalizedScript.
Args:
elements: Output from fountain_parser.parse_fountain()
Returns:
Validated NormalizedScript.
Raises:
NormalizationError: If elements are empty or contain no scene headings.
"""
if not elements:
raise NormalizationError("No elements to normalize — input is empty")
try:
return NormalizedScript(elements=elements)
except ValueError as e:
raise NormalizationError(str(e)) from e

0
src/logging/__init__.py Normal file
View File

63
src/logging/layer_logger.py Normal file
View File

@ -0,0 +1,63 @@
"""Structured JSON logger for pipeline layer executions."""
import json
import os
import time
import uuid
from src.schemas.layer_log import LayerLog, TokenUsage
class LayerLogger:
def __init__(self, project_name: str, output_dir: str = "output"):
self.project_name = project_name
self.log_dir = os.path.join(output_dir, project_name, "logs")
os.makedirs(self.log_dir, exist_ok=True)
self._runs: dict[str, dict] = {}
def start(self, layer_id: str, scene_id: int | None = None) -> str:
"""Begin tracking a layer execution. Returns run_id."""
run_id = uuid.uuid4().hex[:8]
self._runs[run_id] = {
"layer_id": layer_id,
"scene_id": scene_id,
"start_time": time.time(),
}
return run_id
def finish(
self,
run_id: str,
input_hash: str,
output_hash: str | None,
validation_result: str,
failure_state: str | None = None,
retry_count: int = 0,
token_usage: dict | None = None,
) -> LayerLog:
"""Finalize a layer execution and write log to disk."""
run = self._runs.pop(run_id)
execution_time_ms = int((time.time() - run["start_time"]) * 1000)
log = LayerLog(
layer_id=run["layer_id"],
scene_id=run["scene_id"],
run_id=run_id,
timestamp=time.strftime("%Y-%m-%dT%H:%M:%S.000Z", time.gmtime()),
input_hash=input_hash,
output_hash=output_hash,
validation_result=validation_result,
execution_time_ms=execution_time_ms,
failure_state=failure_state,
retry_count=retry_count,
token_usage=TokenUsage(**token_usage) if token_usage else None,
)
# Write log file
scene_part = f"_scene_{run['scene_id']:03d}" if run["scene_id"] is not None else ""
filename = f"{run['layer_id']}{scene_part}_run_{run_id}.json"
filepath = os.path.join(self.log_dir, filename)
with open(filepath, "w", encoding="utf-8") as f:
json.dump(log.model_dump(), f, indent=2)
return log
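
A usage sketch for the logger (hashes abbreviated; one JSON log file lands in output/demo/logs/):

from src.logging.layer_logger import LayerLogger

logger = LayerLogger("demo")
run_id = logger.start("L2", scene_id=7)
# ... the layer does its work here ...
log = logger.finish(
    run_id,
    input_hash="sha256:abc...",
    output_hash="sha256:def...",
    validation_result="valid",
    token_usage={"input": 1200, "output": 800},
)
# log.execution_time_ms is measured from start(); the file is named L2_scene_007_run_<run_id>.json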

17
src/prompts/L2_continuity_check_v1.json Normal file
View File

@ -0,0 +1,17 @@
{
"contract_id": "L2_continuity_check_v1",
"layer": "L2",
"version": 1,
"purpose": "Verify continuity flags across extracted scenes",
"required_output_schema": "SceneArray",
"forbidden_behaviors": [
"Do not add new scenes",
"Do not remove existing scenes",
"Do not change scene_number or scene_heading values",
"Do not invent continuity relationships that are not supported by the script",
"Do not modify action_summary or dialogue_summary"
],
"system_prompt": "You are a continuity verification engine. You will receive a JSON array of extracted scenes from a screenplay.\n\nYour job is to review and correct ONLY the following fields:\n- continuity_notes: Ensure these accurately reflect state changes (injuries, wardrobe changes, time jumps, emotional shifts) that affect other scenes\n- dependencies: Ensure these accurately reference scene numbers that this scene continues from or references\n\nDo NOT modify any other fields. Return the full scenes array with corrections applied.\n\nReturn ONLY a JSON object with key \"scenes\" containing the corrected array. No additional text.",
"user_prompt_template": "Review and correct continuity fields in these extracted scenes:\n\n{{scenes_json}}",
"max_output_tokens": 8000
}

19
src/prompts/L2_scene_extraction_v1.json Normal file
View File

@ -0,0 +1,19 @@
{
"contract_id": "L2_scene_extraction_v1",
"layer": "L2",
"version": 1,
"purpose": "Extract structured scene metadata from normalized screenplay text",
"required_output_schema": "SceneArray",
"forbidden_behaviors": [
"Do not invent characters not present in the script text",
"Do not invent locations not present in the script text",
"Do not invent props not described or implied in the script text",
"Do not merge distinct scenes into one",
"Do not split a single scene into multiple scenes",
"Do not summarize dialogue as action or vice versa",
"Do not leave any field empty — use UNKNOWN if the information cannot be determined"
],
"system_prompt": "You are a script analysis engine. Your job is to extract structured scene metadata from a screenplay.\n\nYou must return a JSON object with a single key \"scenes\" containing an array of scene objects.\n\nEach scene object MUST have ALL of the following fields (no exceptions):\n- scene_number (int): Sequential scene number starting from 1\n- scene_heading (string): The full scene heading line (e.g. \"INT. APARTMENT - NIGHT\")\n- location (string): The location name derived from the heading\n- time_of_day (string): One of: DAWN, MORNING, DAY, AFTERNOON, DUSK, NIGHT, UNKNOWN\n- int_ext (string): One of: INTERIOR, EXTERIOR, BOTH\n- characters_present (string[]): All characters present in the scene\n- new_characters_introduced (string[]): Characters appearing for the first time in the script\n- props (string[]): Significant objects mentioned or implied\n- wardrobe_clues (string[]): Described or implied clothing/appearance details\n- emotional_tone (string): The dominant mood of the scene\n- visual_beats (string[]): Key visual moments or images\n- action_summary (string): 2-3 sentence summary of scene action\n- dialogue_summary (string): 1-2 sentence summary of key dialogue\n- continuity_notes (string[]): State changes relevant to other scenes\n- dependencies (int[]): Scene numbers this scene references or continues from\n\nRULES:\n- Do not invent characters not present in the script text\n- Do not invent locations not present in the script text\n- Do not invent props not described or implied in the script text\n- Do not merge distinct scenes into one\n- Do not split a single scene into multiple scenes\n- Do not summarize dialogue as action or vice versa\n- Do not leave any field empty — use UNKNOWN if the information cannot be determined\n- For list fields where nothing applies, use an empty array []\n- Return ONLY the JSON object, no additional text",
"user_prompt_template": "Extract structured scene metadata from the following screenplay:\n\n{{script_text}}",
"max_output_tokens": 8000
}

View File

@ -0,0 +1,27 @@
{
"$schema": "https://json-schema.org/draft/2020-12/schema",
"type": "object",
"required": [
"contract_id",
"layer",
"version",
"purpose",
"required_output_schema",
"forbidden_behaviors",
"system_prompt",
"user_prompt_template",
"max_output_tokens"
],
"properties": {
"contract_id": { "type": "string" },
"layer": { "type": "string" },
"version": { "type": "integer", "minimum": 1 },
"purpose": { "type": "string" },
"required_output_schema": { "type": "string" },
"forbidden_behaviors": { "type": "array", "items": { "type": "string" } },
"system_prompt": { "type": "string" },
"user_prompt_template": { "type": "string" },
"max_output_tokens": { "type": "integer", "minimum": 1 }
},
"additionalProperties": false
}
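
A contract file can be checked against this schema with the jsonschema package (an assumption: jsonschema is not in requirements.txt, and the schema path below is hypothetical):

import json
import jsonschema

SCHEMA_PATH = "src/prompts/contract_schema.json"  # hypothetical location of this schema file

with open("src/prompts/L2_scene_extraction_v1.json", encoding="utf-8") as f:
    contract = json.load(f)
with open(SCHEMA_PATH, encoding="utf-8") as f:
    schema = json.load(f)

jsonschema.validate(instance=contract, schema=schema)  # raises jsonschema.ValidationError on mismatch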

0
src/schemas/__init__.py Normal file
View File

61
src/schemas/department.py Normal file
View File

@ -0,0 +1,61 @@
"""Department interpretation schemas — Layer 4. Built in Phase 3, defined now for contract stability."""
from pydantic import BaseModel
class DirectingOutput(BaseModel):
scene_objective: str
pacing_notes: str
key_dramatic_beats: list[str]
subtext_notes: str
class CinematographyOutput(BaseModel):
camera_style: str
lens_character: str
movement_patterns: list[str]
depth_of_field_intent: str
color_palette_direction: str
class LightingOutput(BaseModel):
key_light_direction: str
practical_sources: list[str]
mood_notes: str
time_of_day_requirements: str
contrast_ratio_intent: str
class ProductionDesignOutput(BaseModel):
required_set_elements: list[str]
set_dressing_priorities: list[str]
color_texture_palette: str
era_period_notes: str
spatial_blocking_requirements: str
class WardrobeOutput(BaseModel):
character_wardrobe: list[dict]
costume_condition: str
thematic_notes: str
changes_from_previous: str
class PerformanceOutput(BaseModel):
character_states: list[dict]
key_shifts: list[str]
subtext_notes: str
physical_behavior_cues: list[str]
class StoryboardOutput(BaseModel):
character_positions: list[str]
key_composition_frames: list[str]
spatial_relationships: list[str]
entry_exit_patterns: list[str]
class SceneDepartmentBreakdown(BaseModel):
    scene: int
    directing: DirectingOutput
    cinematography: CinematographyOutput
    lighting: LightingOutput
    production_design: ProductionDesignOutput
    wardrobe: WardrobeOutput
    performance: PerformanceOutput
    storyboard: StoryboardOutput

21
src/schemas/layer_log.py Normal file
View File

@ -0,0 +1,21 @@
from typing import Literal, Optional
from pydantic import BaseModel
class TokenUsage(BaseModel):
input: int
output: int
class LayerLog(BaseModel):
layer_id: str
scene_id: Optional[int]
run_id: str
timestamp: str
input_hash: str
output_hash: Optional[str]
validation_result: Literal["valid", "failed", "flagged"]
execution_time_ms: int
failure_state: Optional[str]
retry_count: int
token_usage: Optional[TokenUsage]

19
src/schemas/normalized_script.py Normal file
View File

@ -0,0 +1,19 @@
from pydantic import BaseModel, model_validator
from src.schemas.script_element import ScriptElement
class NormalizedScript(BaseModel):
elements: list[ScriptElement]
    @model_validator(mode="after")
    def must_not_be_empty(self):
        # Checked first so an empty script reports the right error, not "zero scene headings"
        if len(self.elements) == 0:
            raise ValueError("Script contains no elements")
        return self

    @model_validator(mode="after")
    def must_have_scene_heading(self):
        headings = [e for e in self.elements if e.type == "scene_heading"]
        if len(headings) == 0:
            raise ValueError("Script contains zero scene headings — cannot proceed")
        return self

82
src/schemas/production_bible.py Normal file
View File

@ -0,0 +1,82 @@
"""Production Bible schemas — Layer 3. Built in Phase 2, defined now for contract stability."""
from typing import Optional
from pydantic import BaseModel
class Relationship(BaseModel):
character: str
nature: str
class WardrobeState(BaseModel):
scene_range: list[int]
description: str
class EmotionalState(BaseModel):
scene: int
state: str
class Character(BaseModel):
name: str
aliases: list[str]
description: str
arc_summary: str
first_appearance: int
scenes_present: list[int]
relationships: list[Relationship]
wardrobe_states: list[WardrobeState]
emotional_arc: list[EmotionalState]
reference_prompt: str
class Location(BaseModel):
name: str
description: str
type: str
scenes_used: list[int]
time_of_day_variants: list[str]
notable_features: list[str]
mood_associations: list[str]
reference_prompt: str
class Prop(BaseModel):
name: str
description: str
significance: str
scenes_present: list[int]
owner_or_association: str
state_changes: list[EmotionalState] # reuses {scene, state} shape
class WardrobeEntry(BaseModel):
character: str
scene_range: list[int]
description: str
change_trigger: str
class EmotionalBeat(BaseModel):
scene: int
dominant_tone: str
tension_level: int
arc_position: str
class TimelineEntry(BaseModel):
scene: int
story_time: str
elapsed_since_previous: str
concurrent_with: list[int]
class ProductionBible(BaseModel):
characters: list[Character]
locations: list[Location]
props: list[Prop]
wardrobe: list[WardrobeEntry]
emotional_arc: list[EmotionalBeat]
timeline: list[TimelineEntry]

12
src/schemas/prompt_package.py Normal file
View File

@ -0,0 +1,12 @@
"""Prompt package schema — Layer 6. Built in Phase 5, defined now for contract stability."""
from typing import Optional
from pydantic import BaseModel
class PromptPackage(BaseModel):
category: str
scene: int
prompt: str
source_refs: list[str]
negative_prompt: Optional[str] = None

19
src/schemas/scene.py Normal file
View File

@ -0,0 +1,19 @@
from pydantic import BaseModel
class Scene(BaseModel):
scene_number: int
scene_heading: str
location: str
time_of_day: str
int_ext: str
characters_present: list[str]
new_characters_introduced: list[str]
props: list[str]
wardrobe_clues: list[str]
emotional_tone: str
visual_beats: list[str]
action_summary: str
dialogue_summary: str
continuity_notes: list[str]
dependencies: list[int]

20
src/schemas/scene_array.py Normal file
View File

@ -0,0 +1,20 @@
from pydantic import BaseModel, model_validator
from src.schemas.scene import Scene
class SceneArray(BaseModel):
scenes: list[Scene]
@model_validator(mode="after")
def must_have_scenes(self):
if len(self.scenes) == 0:
raise ValueError("SceneArray contains no scenes")
return self
@model_validator(mode="after")
def unique_scene_numbers(self):
numbers = [s.scene_number for s in self.scenes]
dupes = [n for n in numbers if numbers.count(n) > 1]
if dupes:
raise ValueError(f"Duplicate scene numbers: {set(dupes)}")
return self

18
src/schemas/script_element.py Normal file
View File

@ -0,0 +1,18 @@
from typing import Literal, Optional
from pydantic import BaseModel
ELEMENT_TYPES = Literal[
"scene_heading",
"action",
"character",
"dialogue",
"parenthetical",
"transition",
]
class ScriptElement(BaseModel):
type: ELEMENT_TYPES
text: str
line_number: Optional[int] = None

24
src/schemas/shot.py Normal file
View File

@ -0,0 +1,24 @@
"""Shot schema — Layer 5. Built in Phase 4, defined now for contract stability."""
from typing import Optional
from pydantic import BaseModel
class Shot(BaseModel):
shot_id: str
scene: int
order: int
type: str
movement: str
subject: str
framing_notes: str
duration_intent: str
emotional_intent: str
lighting_notes: str
lens_notes: str
action_description: str
dialogue: Optional[str]
transition_in: str
transition_out: str
vfx_notes: Optional[str]
audio_notes: Optional[str]

0
src/understanding/__init__.py Normal file
View File

63
src/understanding/chunker.py Normal file
View File

@ -0,0 +1,63 @@
"""Script chunker — splits long scripts at scene boundaries."""
from src.schemas.normalized_script import NormalizedScript
from src.schemas.script_element import ScriptElement
def chunk_script(script: NormalizedScript, max_chars: int = 50000) -> list[NormalizedScript]:
"""Split a normalized script into chunks at scene boundaries.
Args:
script: Full normalized script.
max_chars: Maximum character count per chunk (approximate, based on text content).
Returns:
List of NormalizedScript chunks. Each contains only complete scenes.
"""
# Find scene boundary indices
scene_starts: list[int] = []
for i, elem in enumerate(script.elements):
if elem.type == "scene_heading":
scene_starts.append(i)
if not scene_starts:
return [script]
    # Build scene groups (each group = elements from one scene heading to the next).
    # Any elements before the first heading ride along with the first group so
    # nothing is dropped.
    scene_groups: list[list[ScriptElement]] = []
    for i, start in enumerate(scene_starts):
        begin = 0 if i == 0 else start
        end = scene_starts[i + 1] if i + 1 < len(scene_starts) else len(script.elements)
        scene_groups.append(script.elements[begin:end])
# Build chunks by accumulating scenes until max_chars
chunks: list[NormalizedScript] = []
current_elements: list[ScriptElement] = []
current_chars = 0
for group in scene_groups:
group_chars = sum(len(e.text) for e in group)
# If a single scene exceeds max_chars, it becomes its own chunk
if group_chars > max_chars:
# Flush current chunk if non-empty
if current_elements:
chunks.append(NormalizedScript(elements=current_elements))
current_elements = []
current_chars = 0
chunks.append(NormalizedScript(elements=list(group)))
continue
# Would adding this scene exceed the limit?
if current_chars + group_chars > max_chars and current_elements:
chunks.append(NormalizedScript(elements=current_elements))
current_elements = []
current_chars = 0
current_elements.extend(group)
current_chars += group_chars
# Flush remaining
if current_elements:
chunks.append(NormalizedScript(elements=current_elements))
return chunks
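
A chunking sketch (tiny max_chars to force a split; the production default is 50,000):

from src.ingestion.fountain_parser import parse_fountain
from src.ingestion.normalizer import normalize
from src.understanding.chunker import chunk_script

with open("demo.fountain", encoding="utf-8") as f:  # hypothetical file
    script = normalize(parse_fountain(f.read()))

chunks = chunk_script(script, max_chars=2000)
# Splits happen only at scene boundaries, and no element is dropped:
assert sum(len(c.elements) for c in chunks) == len(script.elements)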

129
src/understanding/extractor.py Normal file
View File

@ -0,0 +1,129 @@
"""Layer 2 AI extractor — sends normalized script to Claude, receives scene JSON."""
import json
from dataclasses import dataclass
from anthropic import Anthropic
from src.schemas.normalized_script import NormalizedScript
@dataclass
class ExtractionResult:
raw_scenes: list[dict]
token_usage: dict # {"input": int, "output": int}
class ExtractionError(Exception):
pass
def extract_scenes(
script: NormalizedScript,
contract_path: str,
api_key: str,
model: str = "claude-sonnet-4-20250514",
) -> ExtractionResult:
"""Extract structured scene data from a normalized script using AI.
Args:
script: Validated NormalizedScript from Layer 1.
contract_path: Path to the prompt contract JSON file.
api_key: Anthropic API key.
model: Model ID to use.
Returns:
ExtractionResult with raw scene dicts and token usage.
Raises:
ExtractionError: If AI response cannot be parsed as JSON.
"""
# Load prompt contract
with open(contract_path, "r", encoding="utf-8") as f:
contract = json.load(f)
# Build script text from elements
script_text = _elements_to_text(script)
# Render user prompt
user_prompt = contract["user_prompt_template"].replace("{{script_text}}", script_text)
# Call Claude API
client = Anthropic(api_key=api_key)
response = client.messages.create(
model=model,
max_tokens=contract["max_output_tokens"],
temperature=0,
system=contract["system_prompt"],
messages=[{"role": "user", "content": user_prompt}],
)
# Extract text content
response_text = response.content[0].text
# Parse JSON
try:
parsed = json.loads(response_text)
except json.JSONDecodeError as e:
# Try to extract JSON from response if wrapped in markdown
cleaned = _extract_json(response_text)
if cleaned:
try:
parsed = json.loads(cleaned)
except json.JSONDecodeError:
raise ExtractionError(f"AI response is not valid JSON: {e}") from e
else:
raise ExtractionError(f"AI response is not valid JSON: {e}") from e
# Extract scenes array
if isinstance(parsed, dict) and "scenes" in parsed:
scenes = parsed["scenes"]
elif isinstance(parsed, list):
scenes = parsed
else:
raise ExtractionError(f"Unexpected response structure: expected dict with 'scenes' key or list, got {type(parsed)}")
if not isinstance(scenes, list):
raise ExtractionError(f"'scenes' is not a list: {type(scenes)}")
token_usage = {
"input": response.usage.input_tokens,
"output": response.usage.output_tokens,
}
return ExtractionResult(raw_scenes=scenes, token_usage=token_usage)
def _elements_to_text(script: NormalizedScript) -> str:
"""Convert NormalizedScript back to readable text for the AI prompt."""
lines = []
for elem in script.elements:
if elem.type == "scene_heading":
lines.append("")
lines.append(elem.text)
lines.append("")
elif elem.type == "character":
lines.append("")
lines.append(f" {elem.text}")
elif elem.type == "dialogue":
lines.append(f" {elem.text}")
elif elem.type == "parenthetical":
lines.append(f" {elem.text}")
elif elem.type == "transition":
lines.append("")
lines.append(f" {elem.text}")
lines.append("")
else: # action
lines.append(elem.text)
return "\n".join(lines)
def _extract_json(text: str) -> str | None:
    """Try to extract JSON from text that may be wrapped in markdown code blocks."""
    for fence in ("```json", "```"):
        if fence in text:
            start = text.index(fence) + len(fence)
            end = text.find("```", start)
            if end == -1:
                return None  # unterminated code block; give up rather than raise ValueError
            return text[start:end].strip()
    return None
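
The markdown-unwrapping fallback in isolation (fabricated model output; _extract_json is module-private but convenient to demonstrate):

import json
from src.understanding.extractor import _extract_json

wrapped = 'Here are the scenes:\n```json\n{"scenes": []}\n```'
assert json.loads(_extract_json(wrapped)) == {"scenes": []}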

0
src/validators/__init__.py Normal file
View File

92
src/validators/scene_validator.py Normal file
View File

@ -0,0 +1,92 @@
"""Scene-specific validation — post-schema semantic checks for Layer 2 output."""
from dataclasses import dataclass
from typing import Literal
from difflib import SequenceMatcher
from src.schemas.scene_array import SceneArray
@dataclass
class ValidationWarning:
type: Literal[
"duplicate_scene_number",
"similar_character_names",
"scene_count_deviation",
"empty_characters",
"unknown_values",
]
scene_number: int | None
message: str
def validate_scenes(scene_array: SceneArray, heading_count: int) -> list[ValidationWarning]:
"""Run semantic validation checks on extracted scenes.
Args:
scene_array: Validated SceneArray from Layer 2.
heading_count: Number of scene_heading elements from Layer 1 output.
Returns:
List of warnings. Empty list means all checks passed.
"""
warnings: list[ValidationWarning] = []
# 1. Duplicate scene numbers (already caught by SceneArray validator, but double-check)
seen_numbers: dict[int, int] = {}
for scene in scene_array.scenes:
if scene.scene_number in seen_numbers:
warnings.append(ValidationWarning(
type="duplicate_scene_number",
scene_number=scene.scene_number,
message=f"Scene number {scene.scene_number} appears more than once",
))
seen_numbers[scene.scene_number] = seen_numbers.get(scene.scene_number, 0) + 1
# 2. Similar character names (possible duplicates)
all_characters: set[str] = set()
for scene in scene_array.scenes:
all_characters.update(scene.characters_present)
char_list = sorted(all_characters)
for i, name_a in enumerate(char_list):
for name_b in char_list[i + 1:]:
ratio = SequenceMatcher(None, name_a.upper(), name_b.upper()).ratio()
            if ratio > 0.8:  # pairs from a sorted set are distinct by construction
warnings.append(ValidationWarning(
type="similar_character_names",
scene_number=None,
message=f"Possible duplicate characters: '{name_a}' and '{name_b}' (similarity: {ratio:.0%})",
))
# 3. Scene count deviation
extracted_count = len(scene_array.scenes)
if heading_count > 0:
deviation = abs(extracted_count - heading_count) / heading_count
if deviation > 0.20:
warnings.append(ValidationWarning(
type="scene_count_deviation",
scene_number=None,
message=f"Extracted {extracted_count} scenes but Layer 1 found {heading_count} scene headings (deviation: {deviation:.0%})",
))
# 4. Empty characters_present
for scene in scene_array.scenes:
if not scene.characters_present:
warnings.append(ValidationWarning(
type="empty_characters",
scene_number=scene.scene_number,
message=f"Scene {scene.scene_number} has no characters listed",
))
# 5. UNKNOWN values
for scene in scene_array.scenes:
scene_dict = scene.model_dump()
for key, value in scene_dict.items():
if value == "UNKNOWN" or (isinstance(value, list) and "UNKNOWN" in value):
warnings.append(ValidationWarning(
type="unknown_values",
scene_number=scene.scene_number,
message=f"Scene {scene.scene_number} field '{key}' contains UNKNOWN",
))
return warnings
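
A semantic-check sketch (two near-identical character names trigger a warning; the helper is hypothetical):

from src.schemas.scene import Scene
from src.schemas.scene_array import SceneArray
from src.validators.scene_validator import validate_scenes

def make_scene(n, chars):  # brevity helper, not part of the repo
    return Scene(scene_number=n, scene_heading="INT. LAB - DAY", location="LAB",
                 time_of_day="DAY", int_ext="INTERIOR", characters_present=chars,
                 new_characters_introduced=[], props=[], wardrobe_clues=[],
                 emotional_tone="calm", visual_beats=[], action_summary="Work continues.",
                 dialogue_summary="Routine chatter.", continuity_notes=[], dependencies=[])

arr = SceneArray(scenes=[make_scene(1, ["DR. NAVARRO"]), make_scene(2, ["DR. NAVARO"])])
warnings = validate_scenes(arr, heading_count=2)
# -> one similar_character_names warning: 'DR. NAVARO' vs 'DR. NAVARRO' (~95% similar)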

48
src/validators/schema_validator.py Normal file
View File

@ -0,0 +1,48 @@
"""Generic schema validator — validates raw dicts against Pydantic models."""
from dataclasses import dataclass, field
from typing import Literal, Optional, Any
from pydantic import BaseModel, ValidationError
@dataclass
class ValidationResult:
status: Literal["valid", "failed", "flagged"]
errors: list[str] = field(default_factory=list)
data: Optional[Any] = None # populated with the Pydantic model instance if valid/flagged
def validate(raw: dict, schema_class: type[BaseModel]) -> ValidationResult:
"""Validate a raw dict against a Pydantic model class.
Returns:
ValidationResult with status:
- "valid": all fields present, correct types, no UNKNOWN values
- "flagged": structurally valid but contains UNKNOWN values
- "failed": schema validation error (missing fields, wrong types)
"""
try:
instance = schema_class.model_validate(raw)
except ValidationError as e:
error_messages = []
for err in e.errors():
loc = " -> ".join(str(x) for x in err["loc"])
error_messages.append(f"{loc}: {err['msg']}")
return ValidationResult(status="failed", errors=error_messages, data=None)
# Check for UNKNOWN values — structurally valid but semantically incomplete
if _contains_unknown(raw):
return ValidationResult(status="flagged", errors=["Contains UNKNOWN values"], data=instance)
return ValidationResult(status="valid", errors=[], data=instance)
def _contains_unknown(obj: Any) -> bool:
"""Recursively check if any value in the structure is the string 'UNKNOWN'."""
if isinstance(obj, str):
return obj == "UNKNOWN"
if isinstance(obj, dict):
return any(_contains_unknown(v) for v in obj.values())
if isinstance(obj, list):
return any(_contains_unknown(item) for item in obj)
return False
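
A validator sketch showing all three statuses (the scene dict is fabricated):

from src.validators.schema_validator import validate
from src.schemas.scene import Scene

base = dict(
    scene_number=1, scene_heading="INT. APARTMENT - NIGHT", location="APARTMENT",
    time_of_day="NIGHT", int_ext="INTERIOR", characters_present=["JACK"],
    new_characters_introduced=[], props=[], wardrobe_clues=[], emotional_tone="tense",
    visual_beats=[], action_summary="Jack waits by the window.", dialogue_summary="None.",
    continuity_notes=[], dependencies=[],
)
assert validate(base, Scene).status == "valid"
assert validate({**base, "time_of_day": "UNKNOWN"}, Scene).status == "flagged"
assert validate({"scene_number": 1}, Scene).status == "failed"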

238
test_scripts/action_heavy.fountain Normal file
View File

@ -0,0 +1,238 @@
Title: The Extraction
Credit: written by
Author: Test Script
Draft date: 2026-04-06
====
EXT. ROOFTOP - MEXICO CITY - NIGHT
A sprawling cityscape glitters below. JACK REEVES (40s, ex-military, scar across his jaw) crouches behind an air conditioning unit, binoculars pressed to his eyes.
Through the binoculars: a fortified compound three blocks away. Armed guards patrol the perimeter.
Jack touches his earpiece.
JACK
I count twelve on the outside. How many inside?
DISPATCH (V.O.)
Thermal says another eight. Plus the package.
JACK
Twenty hostiles. One extraction target. Lovely.
He slings a tactical bag over his shoulder and moves to the roof's edge.
EXT. ALLEY - MEXICO CITY - NIGHT
Jack drops from a fire escape, landing silently. He presses against the wall as a patrol vehicle passes.
He checks his watch: 11:47 PM.
He moves through the shadows toward the compound.
EXT. COMPOUND PERIMETER - NIGHT
Jack reaches a drainage grate at the compound's east wall. He pulls a small cutting tool from his bag and works the bolts.
A guard's flashlight sweeps nearby. Jack freezes. The light passes.
He removes the grate and slides into the drain.
INT. DRAINAGE TUNNEL - NIGHT
Cramped, wet, dark. Jack crawls through the tunnel using a red-filtered headlamp. Rats scatter ahead of him.
He reaches a junction and checks a hand-drawn map.
JACK
(whispering)
Left fork, thirty meters, then up.
He crawls left.
INT. COMPOUND - BASEMENT - NIGHT
Jack pushes up through a floor grate into a dimly lit basement. Stacked crates everywhere. He draws his suppressed pistol and listens.
Footsteps above. Two sets, moving away.
He crosses the basement to a steel door. Locked. He pulls a bypass kit from his bag and works the lock. It clicks open in twelve seconds.
INT. COMPOUND - GROUND FLOOR CORRIDOR - NIGHT
Jack moves down a concrete corridor, back to the wall. Security camera ahead — he pulls a small device from his pocket and aims it. The camera's red light blinks off.
He rounds a corner and comes face to face with GUARD #1.
A brutal, silent fight. Jack disarms the guard, puts him in a chokehold. The guard goes limp. Jack drags him into a side room.
INT. COMPOUND - STAIRWELL - NIGHT
Jack climbs the stairs, pistol leading. He pauses at the second floor landing — voices in Spanish behind the door.
He waits. The voices move away. He opens the door slowly.
INT. COMPOUND - SECOND FLOOR - NIGHT
A hallway lined with doors. At the far end, GUARD #2 and GUARD #3 stand outside a reinforced door.
Jack checks his options. A ventilation shaft above him. He holsters the pistol, jumps, grabs the edge, and pulls himself up.
INT. VENTILATION SHAFT - NIGHT
Jack crawls above the hallway. Through a vent below, he can see the two guards.
He pulls two smoke canisters from his bag. Drops them through the vent.
The hallway fills with white smoke. Shouts from the guards. Jack drops through the vent, lands between them.
Two precise strikes. Both guards are down in four seconds.
INT. COMPOUND - HOLDING ROOM - NIGHT
Jack kicks through the reinforced door. Inside: a bare room with a single chair. DR. SOFIA NAVARRO (30s, bruised, defiant) is handcuffed to it.
SOFIA
About time.
Jack cuts the cuffs with bolt cutters.
JACK
Can you run?
SOFIA
Can you get us out of here?
An alarm blares. Red lights flood the corridor outside.
JACK
That answers that. Move.
He hands her a spare earpiece and they exit into the hallway.
INT. COMPOUND - SECOND FLOOR - NIGHT
They run. Jack fires twice behind them as guards pour from a doorway. Both shots hit.
JACK
Stairs — go left!
They hit the stairwell. Gunfire chases them, sparking off the metal railing.
EXT. COMPOUND - COURTYARD - NIGHT
They burst through a ground floor exit into an open courtyard. Floodlights snap on. Guards converge from two sides.
Jack pulls a flashbang from his vest. Throws it. BANG — blinding white light.
In the chaos, Jack grabs Sofia and they sprint for the east wall.
JACK
Over the wall. I'll boost you.
He cups his hands. Sofia steps up and scrambles over. Jack follows, bullets pocking the concrete around him.
EXT. STREET - MEXICO CITY - NIGHT
They drop onto a narrow street. A black SUV screeches around the corner — RAMOS (30s, driver, cool under fire) behind the wheel.
RAMOS
Get in!
They dive into the back. Ramos floors it.
INT. SUV - MOVING - NIGHT
Jack reloads. Sofia catches her breath.
SOFIA
Who sent you?
JACK
People who want you alive.
SOFIA
That narrows it down to almost nobody.
Rear window SHATTERS — pursuit vehicle behind them.
RAMOS
Hang on!
Ramos cuts hard left through a market street. Stalls explode as the SUV plows through. The pursuit vehicle follows.
Jack leans out the window and fires at the pursuit vehicle's tires. Third shot connects — the vehicle swerves and crashes into a storefront.
EXT. HIGHWAY - NIGHT
The SUV merges onto a highway. City lights fall behind them.
Silence in the car. Ramos checks the mirror — no pursuit.
RAMOS
Airstrip is forty minutes out.
Jack nods. He looks at Sofia.
JACK
You're going to have to tell them everything. The lab, the formula, all of it.
SOFIA
I know.
JACK
They won't be happy about what you found.
SOFIA
That's why they tried to kill me.
EXT. DESERT AIRSTRIP - NIGHT
A small prop plane waits on a dirt runway, engines running. The SUV skids to a stop beside it.
Jack, Sofia, and Ramos exit. PILOT waves them toward the plane.
JACK
(to Ramos)
Burn the car.
RAMOS
Already planned on it.
Jack and Sofia board the plane.
INT. PROP PLANE - NIGHT
They settle into seats. The plane taxis and lifts off. Through the window, they watch the SUV ignite below — Ramos standing clear, lighter in hand.
SOFIA
Where are we going?
JACK
Somewhere they can't reach you.
SOFIA
Does that place exist?
Jack doesn't answer. The plane climbs into darkness.
EXT. COMPOUND - NIGHT
The compound is in chaos. Guards shout, lights sweep the surrounding blocks. COLONEL VEGA (50s, cold, scarred) stands in the holding room, staring at the empty chair and cut handcuffs.
An AIDE approaches.
AIDE
They escaped east. Vehicle lost them on the highway.
VEGA
Find the pilot's name. Find the airstrip. Find the plane.
He picks up the cut handcuffs.
VEGA
And find whoever sent the soldier.
FADE OUT.

View File

@ -0,0 +1,267 @@
Title: The Weight of Words
Credit: written by
Author: Test Script
Draft date: 2026-04-06
====
INT. THERAPIST'S OFFICE - DAY
A small, warm room. Bookshelves line the walls. DR. ELENA VOSS (50s, calm, measured) sits across from MARCUS CHEN (30s, restless, avoiding eye contact).
DR. VOSS
How have you been sleeping?
MARCUS
(shifting in his seat)
Fine. Same as always.
DR. VOSS
Marcus, we've talked about this. "Fine" isn't —
MARCUS
It's not a feeling, I know. You've said that.
A long beat. Elena writes something in her notebook.
DR. VOSS
Your sister called me.
MARCUS
She had no right to do that.
DR. VOSS
She's worried about you. She said you haven't left the apartment in two weeks.
MARCUS
That's an exaggeration.
DR. VOSS
Is it?
Marcus finally looks at her. His eyes are red-rimmed.
INT. MARCUS'S APARTMENT - NIGHT
A cluttered studio apartment. Takeout containers on every surface. Marcus sits on the floor against the wall, phone in hand. He stares at a text from LILY CHEN: "Please call me back."
He sets the phone face-down.
INT. COFFEE SHOP - MORNING
Bright, busy. LILY CHEN (late 20s, sharp, put-together but tired) waits at a corner table. Marcus enters, looking like he hasn't slept.
LILY
You look terrible.
MARCUS
Thanks. Love you too.
He sits. An awkward silence.
LILY
Mom's been asking about you.
MARCUS
Tell her I'm fine.
LILY
I'm not going to lie to her, Marcus.
MARCUS
Then don't tell her anything.
LILY
That's the same thing and you know it.
MARCUS
(quiet)
How is she?
LILY
She's scared. Dad's getting worse and you won't even visit.
Marcus stares at his coffee.
INT. HOSPITAL ROOM - AFTERNOON
HENRY CHEN (60s) lies in a hospital bed, frail but alert. Marcus stands in the doorway, unable to step inside.
HENRY
Are you going to stand there all day or come in?
Marcus enters slowly. He sits in the chair beside the bed.
HENRY
Your sister tells me you've been hiding.
MARCUS
I haven't been hiding.
HENRY
(smiling weakly)
You're a terrible liar. Always were.
MARCUS
Dad —
HENRY
I'm dying, Marcus. Not dead yet. You can still talk to me.
Marcus's composure cracks. He puts his head in his hands.
INT. THERAPIST'S OFFICE - DAY
Back with Dr. Voss. Marcus is more open now, leaning forward.
MARCUS
He just said it. Like it was nothing. "I'm dying."
DR. VOSS
How did that make you feel?
MARCUS
Like I've been wasting time. Like every day I spent not going to see him was —
He stops.
DR. VOSS
Was what?
MARCUS
Selfish. It was selfish.
DR. VOSS
You were protecting yourself. That's not the same as selfishness.
MARCUS
Isn't it?
INT. MARCUS'S APARTMENT - NIGHT
Marcus packs a small bag. He picks up a framed photo from the shelf — a young Marcus and Henry at a baseball game, both laughing.
He puts the photo in the bag.
INT. HOSPITAL ROOM - NIGHT
Marcus enters with the bag. Henry is asleep. NURSE PATRICIA (40s, gentle) is checking his IV.
NURSE PATRICIA
He's been asking for you.
MARCUS
I'm here now.
He sits in the chair and takes his father's hand. Henry stirs.
HENRY
(half asleep)
Marcus?
MARCUS
I'm here, Dad. I'm not going anywhere.
Henry squeezes his hand weakly and drifts back to sleep. Marcus settles in, pulling his jacket around himself like a blanket.
INT. HOSPITAL CAFETERIA - MORNING
Marcus and Lily sit across from each other, both holding bad coffee.
LILY
You stayed all night?
MARCUS
Yeah.
LILY
(softening)
Thank you.
MARCUS
Don't thank me. I should have been here weeks ago.
LILY
You're here now. That's what matters.
A beat.
MARCUS
I'm going to take a leave from work. Stay until... however long.
LILY
(eyes welling up)
Okay.
MARCUS
I called Dr. Voss. She's going to do phone sessions.
LILY
That's good. That's really good, Marcus.
They sit in silence, something unspoken settling between them.
INT. HOSPITAL ROOM - AFTERNOON
Marcus reads aloud from a book. Henry listens, eyes closed but smiling.
HENRY
You always did have a good voice for reading.
MARCUS
Mom used to say I should have been an actor.
HENRY
Your mother was right about most things.
MARCUS
She was right about you too.
HENRY
(opening his eyes)
What did she say?
MARCUS
That you were the most stubborn man alive.
HENRY
(laughing, then coughing)
Was. Was the most stubborn.
They share a look — sad, warm, real.
INT. HOSPITAL HALLWAY - EVENING
Marcus leans against the wall outside Henry's room, on the phone.
MARCUS
(into phone)
I know I missed the deadline. I understand... No, I'm not asking for sympathy. I'm asking for two weeks... Thank you. I appreciate it.
He hangs up. Takes a breath. Goes back inside.
INT. THERAPIST'S OFFICE - DAY (PHONE SESSION)
Marcus sits in the hospital cafeteria, phone to his ear.
DR. VOSS (V.O.)
How does it feel to be there?
MARCUS
Terrifying. And also... right. Like I'm finally where I'm supposed to be.
DR. VOSS (V.O.)
That's a significant shift, Marcus.
MARCUS
I know. I just wish it hadn't taken this long.
DR. VOSS (V.O.)
The important thing isn't when you arrived. It's that you stayed.
Marcus looks through the cafeteria window toward the hallway leading to his father's room.
MARCUS
Yeah. I'm staying.
FADE OUT.

View File

@ -0,0 +1,7 @@
{
"expected_scene_count": 18,
"expected_characters": ["JACK", "SOFIA", "RAMOS", "VEGA"],
"expected_locations": ["ROOFTOP", "ALLEY", "COMPOUND", "DRAINAGE TUNNEL", "STAIRWELL", "SUV", "HIGHWAY", "AIRSTRIP", "PROP PLANE"],
"must_not_contain_characters": [],
"must_not_contain_locations": []
}

View File

@ -0,0 +1,7 @@
{
"expected_scene_count": 13,
"expected_characters": ["MARCUS", "DR. VOSS", "LILY", "HENRY", "NURSE PATRICIA"],
"expected_locations": ["THERAPIST'S OFFICE", "APARTMENT", "COFFEE SHOP", "HOSPITAL ROOM", "HOSPITAL CAFETERIA", "HOSPITAL HALLWAY"],
"must_not_contain_characters": [],
"must_not_contain_locations": []
}

View File

@ -0,0 +1,7 @@
{
"expected_scene_count": 11,
"expected_characters": ["NADIA", "DEREK", "TOMMY", "MRS. PETROV", "MOTHER"],
"expected_locations": ["APARTMENT", "GROCERY STORE", "PARKING LOT", "BUS STOP", "BUS", "GAS STATION", "FAMILY HOUSE", "MOTEL ROOM"],
"must_not_contain_characters": [],
"must_not_contain_locations": []
}
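
All three expected fixtures share the same five fields. Below is a minimal sketch of a comparison step that could consume them, assuming extracted scenes arrive as dicts with "characters" and "location" keys; the helper name, the dict shape, and the substring match on locations (the fixtures say APARTMENT where headings read INT. MARCUS'S APARTMENT - NIGHT) are assumptions for illustration, not this commit's validator:

import json

def check_expectations(expected_path: str, scenes: list[dict]) -> list[str]:
    """Compare Layer 2 output against an expected/*.json fixture.

    Assumes each scene is a dict with "characters" (list of names) and
    "location" (normalized heading string); the real output schema lives
    in the Pydantic models and may differ.
    """
    with open(expected_path) as f:
        expected = json.load(f)

    failures: list[str] = []

    if len(scenes) != expected["expected_scene_count"]:
        failures.append(
            f'scene count {len(scenes)} != expected {expected["expected_scene_count"]}'
        )

    found_chars = {c for s in scenes for c in s.get("characters", [])}
    found_locs = {s.get("location", "") for s in scenes}

    for name in expected["expected_characters"]:
        if name not in found_chars:
            failures.append(f"missing character: {name}")
    for loc in expected["expected_locations"]:
        # Substring match: fixtures say APARTMENT, headings say
        # INT. MARCUS'S APARTMENT - NIGHT.
        if not any(loc in fl for fl in found_locs):
            failures.append(f"missing location: {loc}")
    for name in expected["must_not_contain_characters"]:
        if name in found_chars:
            failures.append(f"must-not-contain character present: {name}")
    for loc in expected["must_not_contain_locations"]:
        if any(loc in fl for fl in found_locs):
            failures.append(f"must-not-contain location present: {loc}")

    return failures

Returning a failure list rather than raising on the first mismatch lets one run report every divergence in a fixture at once.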

View File

@ -0,0 +1,186 @@
Title: Loose Threads
Author: Test Script
====
apartment - morning
NADIA wakes up on the couch. Still in yesterday's clothes. Her phone buzzes on the coffee table — 14 missed calls.
She ignores it and walks to the kitchen.
NADIA
(to herself)
Not today.
She opens the fridge. Empty except for a bottle of hot sauce.
INT GROCERY STORE DAY
Nadia pushes a cart through fluorescent aisles. She looks half-asleep. Her phone rings again. She silences it.
at the checkout, TOMMY (20s, works here, knows everybody) scans her items.
TOMMY
You look rough.
NADIA
Didn't sleep.
TOMMY
Your brother was in here earlier looking for you.
Nadia freezes.
NADIA
What did you tell him?
TOMMY
Nothing. I don't get involved.
EXT. PARKING LOT - DAY
Nadia loads groceries into her car. A beat-up sedan is parked three spaces away. DEREK (30s, her brother, intense) leans against it.
DEREK
We need to talk.
NADIA
No we don't.
DEREK
It's about the house.
NADIA
I said no.
She gets in her car and drives away. Derek watches her go.
nadia's apartment - night
Nadia sits at her kitchen table eating cereal. A KNOCK at the door. She doesn't move. Another KNOCK.
MRS. PETROV (O.S.)
Nadia, I know you're in there. I can hear the television.
Nadia mutes the TV and stays quiet.
MRS. PETROV (O.S.)
Your rent is late again, dear.
NADIA
(calling out)
I'll have it Friday!
MRS. PETROV (O.S.)
That's what you said last Friday.
Silence. Footsteps recede.
EXT. BUS STOP - EARLY MORNING
Nadia waits at a bus stop, duffel bag at her feet. The sun is barely up. Nobody else around.
Her phone rings. She looks at the screen — "DEREK." She answers.
NADIA
What.
DEREK (V.O.)
The lawyer says we have to both sign or we lose everything.
NADIA
Maybe we should lose everything.
DEREK (V.O.)
You don't mean that.
NADIA
Don't tell me what I mean.
She hangs up. The bus arrives. She boards.
INT - BUS - MORNING
Nadia sits in the back. PASSENGER (elderly woman) across the aisle knits something blue.
PASSENGER
Going far?
NADIA
Hopefully.
The bus pulls away.
EXT GAS STATION - AFTERNOON
The bus has stopped. Passengers mill around. Nadia buys a sandwich from a vending machine and sits on the curb.
Her phone has 3 new texts from Derek. She reads them:
"Please just call me"
"This isn't about us this is about mom's house"
"Nadia"
She types: "I need a few days." Sends it.
.FLASHBACK - INT. FAMILY HOUSE - KITCHEN - YEARS AGO
Young NADIA (12) and young DEREK (16) sit at the kitchen table. Their MOTHER sets plates in front of them.
MOTHER
Eat your dinner before it gets cold.
YOUNG DEREK
Mom, can I go to Tyler's after?
MOTHER
After you clean your room.
YOUNG NADIA
His room is disgusting.
YOUNG DEREK
Shut up.
MOTHER
Both of you. Eat.
A normal evening. Nothing special. Everything special.
INT. MOTEL ROOM - NIGHT
Cheap. Clean enough. Nadia drops her duffel on the bed and lies down without undressing.
She stares at the ceiling for a long time.
Then she calls Derek.
DEREK (V.O.)
Nadia?
NADIA
Tell the lawyer I'll sign.
DEREK (V.O.)
Really?
NADIA
But I want to go through the house first. Before they clear it.
DEREK (V.O.)
...okay. Yeah. We can do that.
NADIA
I'll be back Thursday.
DEREK (V.O.)
Okay. Drive safe. Or... bus safe. Whatever.
NADIA
(almost smiling)
Goodnight, Derek.
She hangs up. Closes her eyes.
> FADE OUT.
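
Loose Threads is the nonstandard fixture: a lowercase heading (apartment - morning), separator-free ones (INT GROCERY STORE DAY, INT - BUS - MORNING), a Fountain forced heading (.FLASHBACK - ...), and a forced transition (> FADE OUT.). Below is a rough sketch of the detection and cleanup Layer 1 needs to handle for this file; the patterns and function names are illustrative, not the commit's actual normalizer:

import re

# Heading shapes this fixture exercises. Patterns are sketches to be
# tuned against the real corpus, not the parser in src/.
TIMES = r"(?:EARLY MORNING|MORNING|DAY|AFTERNOON|EVENING|NIGHT|YEARS AGO)"
STANDARD = re.compile(r"^(INT|EXT|INT/EXT|I/E)\b", re.IGNORECASE)
FORCED = re.compile(r"^\.(?![. ])")  # Fountain: a single leading "." forces a heading
BARE = re.compile(rf"^[\w' ]+ - {TIMES}$", re.IGNORECASE)  # "apartment - morning"

def detect_heading(line: str) -> bool:
    s = line.strip()
    return bool(STANDARD.match(s) or FORCED.match(s) or BARE.match(s))

def normalize_heading(line: str) -> str:
    """Best-effort rewrite into INT./EXT. LOCATION - TIME form."""
    s = line.strip().upper().lstrip(".").strip()
    if not re.search(r"\b(INT|EXT)\b", s):
        # Assumption: bare headings in this fixture are interiors.
        s = "INT. " + s
    # "INT GROCERY STORE DAY" / "INT - BUS - MORNING" -> "INT. ..."
    s = re.sub(r"^(INT|EXT)[\s.\-]+", r"\1. ", s)
    return s

Run over this script, the detector should fire on exactly the nine heading lines and on none of the character cues or action lines, which is what the fixture's expected_scene_count asserts.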