from __future__ import annotations

import os
from datetime import datetime
from typing import List, Dict, Any


def extract_metadata(file_path: str, chunks: List[str], original_filename: str | None = None) -> List[Dict[str, Any]]:
    """Build one metadata record per text chunk.

    Every record is a dictionary with the keys:
        - filename: ``original_filename`` when it is non-empty, otherwise the
          basename of ``file_path``.
        - upload_date: ISO 8601 string taken from ``datetime.now()``; it is
          computed once per call, so all records share the same value.
        - content_summary: the first 200 characters of the chunk (the whole
          chunk when shorter, an empty string when the chunk is not a str).
        - chunk_index: 0-based position of the chunk within ``chunks``.

    Args:
        file_path: Path of the file the chunks came from.
        chunks: Text chunks to describe.
        original_filename: Name to record instead of the basename of
            ``file_path`` (e.g. when ``file_path`` is a temp file).

    Returns:
        One metadata dictionary per chunk, in input order. An empty
        ``chunks`` yields an empty list.

    Raises:
        FileNotFoundError: If ``chunks`` is non-empty and ``file_path`` does
            not exist. The check is skipped entirely for empty ``chunks``.
    """
    # Nothing to describe: return before touching the filesystem.
    if not chunks:
        return []

    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    # A falsy override (None or "") falls back to the path's basename.
    stored_name = original_filename or os.path.basename(file_path)
    timestamp = datetime.now().isoformat()

    return [
        {
            "filename": stored_name,
            "upload_date": timestamp,
            # Non-string chunks get an empty summary instead of raising.
            "content_summary": piece[:200] if isinstance(piece, str) else "",
            "chunk_index": position,
        }
        for position, piece in enumerate(chunks)
    ]