from __future__ import annotations

import os
from datetime import datetime
from typing import Any, Dict, List


def extract_metadata(
    file_path: str,
    chunks: List[str],
    *,
    summary_length: int = 200,
) -> List[Dict[str, Any]]:
    """Build one metadata dictionary per text chunk.

    Each dictionary has these keys:

    - ``filename``: ``os.path.basename(file_path)``.
    - ``upload_date``: ISO 8601 string taken from ``datetime.now()`` (naive
      local time), computed once per call and shared by every chunk.
    - ``content_summary``: the first ``summary_length`` characters of the
      chunk (the whole chunk if it is shorter). A chunk that is not a
      ``str`` gets an empty summary instead of raising.
    - ``chunk_index``: 0-based position of the chunk in ``chunks``.

    Args:
        file_path: Path to the file the chunks were produced from.
        chunks: Text chunks to describe.
        summary_length: Maximum number of characters kept in
            ``content_summary``. Defaults to 200.

    Returns:
        A list with one metadata dictionary per chunk, in input order.
        An empty ``chunks`` yields an empty list, and in that case the
        existence of ``file_path`` is not checked.

    Raises:
        ValueError: If ``summary_length`` is negative.
        FileNotFoundError: If ``chunks`` is non-empty and ``file_path``
            does not exist.
    """
    # A negative bound would silently drop the *tail* of each chunk
    # (``text[:-n]``), which is never what a "first N characters" summary
    # should do, so reject it explicitly.
    if summary_length < 0:
        raise ValueError(
            f"summary_length must be non-negative, got {summary_length}"
        )

    # Nothing to describe: return early, before touching the filesystem.
    if not chunks:
        return []

    # Fail once, up front, rather than after building partial metadata.
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    filename = os.path.basename(file_path)
    # Single timestamp so all chunks of one call carry the same upload_date.
    upload_date = datetime.now().isoformat()

    return [
        {
            "filename": filename,
            "upload_date": upload_date,
            # Non-string chunks are tolerated and summarized as "".
            "content_summary": (
                chunk[:summary_length] if isinstance(chunk, str) else ""
            ),
            "chunk_index": idx,
        }
        for idx, chunk in enumerate(chunks)
    ]