"""Scene-specific validation — post-schema semantic checks for Layer 2 output.""" from dataclasses import dataclass from typing import Literal from difflib import SequenceMatcher from src.schemas.scene_array import SceneArray @dataclass class ValidationWarning: type: Literal[ "duplicate_scene_number", "similar_character_names", "scene_count_deviation", "empty_characters", "unknown_values", ] scene_number: int | None message: str def validate_scenes(scene_array: SceneArray, heading_count: int) -> list[ValidationWarning]: """Run semantic validation checks on extracted scenes. Args: scene_array: Validated SceneArray from Layer 2. heading_count: Number of scene_heading elements from Layer 1 output. Returns: List of warnings. Empty list means all checks passed. """ warnings: list[ValidationWarning] = [] # 1. Duplicate scene numbers (already caught by SceneArray validator, but double-check) seen_numbers: dict[int, int] = {} for scene in scene_array.scenes: if scene.scene_number in seen_numbers: warnings.append(ValidationWarning( type="duplicate_scene_number", scene_number=scene.scene_number, message=f"Scene number {scene.scene_number} appears more than once", )) seen_numbers[scene.scene_number] = seen_numbers.get(scene.scene_number, 0) + 1 # 2. Similar character names (possible duplicates) all_characters: set[str] = set() for scene in scene_array.scenes: all_characters.update(scene.characters_present) char_list = sorted(all_characters) for i, name_a in enumerate(char_list): for name_b in char_list[i + 1:]: ratio = SequenceMatcher(None, name_a.upper(), name_b.upper()).ratio() if ratio > 0.8 and name_a != name_b: warnings.append(ValidationWarning( type="similar_character_names", scene_number=None, message=f"Possible duplicate characters: '{name_a}' and '{name_b}' (similarity: {ratio:.0%})", )) # 3. Scene count deviation extracted_count = len(scene_array.scenes) if heading_count > 0: deviation = abs(extracted_count - heading_count) / heading_count if deviation > 0.20: warnings.append(ValidationWarning( type="scene_count_deviation", scene_number=None, message=f"Extracted {extracted_count} scenes but Layer 1 found {heading_count} scene headings (deviation: {deviation:.0%})", )) # 4. Empty characters_present for scene in scene_array.scenes: if not scene.characters_present: warnings.append(ValidationWarning( type="empty_characters", scene_number=scene.scene_number, message=f"Scene {scene.scene_number} has no characters listed", )) # 5. UNKNOWN values for scene in scene_array.scenes: scene_dict = scene.model_dump() for key, value in scene_dict.items(): if value == "UNKNOWN" or (isinstance(value, list) and "UNKNOWN" in value): warnings.append(ValidationWarning( type="unknown_values", scene_number=scene.scene_number, message=f"Scene {scene.scene_number} field '{key}' contains UNKNOWN", )) return warnings