from __future__ import annotations

import os
from datetime import datetime
from typing import List, Dict, Any, Optional


def extract_metadata(
    file_path: str,
    chunks: List[str],
    original_filename: str | None = None,
    page_numbers: List[int | None] | None = None,
    chunk_file_paths: List[str | None] | None = None,
    document_id: str | None = None,
) -> List[Dict[str, Any]]:
    """Build one metadata dictionary per text chunk of a document.

    Keys always present in every entry:
        - filename: ``original_filename`` when it is non-empty, otherwise the
          base name of ``file_path``.
        - upload_date: ISO-8601 string from ``datetime.now()`` (naive, no
          timezone), computed once and shared by all entries of the call.
        - content_summary: first 200 characters of the chunk; an empty string
          when the chunk is not a ``str``.
        - chunk_index: zero-based position of the chunk in ``chunks``.
        - document_id: the ``document_id`` argument (``None`` when omitted).

    Keys added only when a non-None value exists for that chunk:
        - page_number: taken from ``page_numbers``.
        - chunk_file_path: taken from ``chunk_file_paths``.

    Args:
        file_path: Path to the file the chunks came from. Must exist unless
            ``chunks`` is empty.
        chunks: Text chunks to describe.
        original_filename: Name to record instead of the base name of
            ``file_path`` (useful when ``file_path`` is a temporary file).
        page_numbers: Optional per-chunk page numbers; same length as
            ``chunks``. Individual items may be ``None``.
        chunk_file_paths: Optional per-chunk source file paths; same length
            as ``chunks``. Individual items may be ``None``.
        document_id: Optional identifier copied into every entry.

    Returns:
        A list of metadata dictionaries in chunk order. An empty list is
        returned for empty ``chunks`` without any further validation.

    Raises:
        FileNotFoundError: If ``chunks`` is non-empty and ``file_path`` does
            not exist.
        ValueError: If ``page_numbers`` or ``chunk_file_paths`` is given and
            its length differs from ``len(chunks)``.
    """
    # Nothing to describe: return before touching the filesystem.
    if not chunks:
        return []

    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    if page_numbers is not None and len(page_numbers) != len(chunks):
        raise ValueError(
            f"page_numbers length ({len(page_numbers)}) does not match chunks length ({len(chunks)})"
        )

    if chunk_file_paths is not None and len(chunk_file_paths) != len(chunks):
        raise ValueError(
            f"chunk_file_paths length ({len(chunk_file_paths)}) does not match chunks length ({len(chunks)})"
        )

    display_name = original_filename or os.path.basename(file_path)
    stamped_at = datetime.now().isoformat()

    # Missing per-chunk sequences behave like "all None" so the loop below
    # can walk the three sequences in lockstep.
    pages = page_numbers if page_numbers else [None] * len(chunks)
    source_paths = chunk_file_paths if chunk_file_paths else [None] * len(chunks)

    records: List[Dict[str, Any]] = []
    for position, (piece, page, source_path) in enumerate(
        zip(chunks, pages, source_paths)
    ):
        record: Dict[str, Any] = {
            "filename": display_name,
            "upload_date": stamped_at,
            # Non-string chunks are tolerated and summarized as empty text.
            "content_summary": piece[:200] if isinstance(piece, str) else "",
            "chunk_index": position,
            "document_id": document_id,
        }
        # Optional keys are left out entirely when there is no value.
        if page is not None:
            record["page_number"] = page
        if source_path is not None:
            record["chunk_file_path"] = source_path
        records.append(record)

    return records