from __future__ import annotations

import os
from datetime import datetime
from typing import List, Dict, Any, Optional


def extract_metadata(
    file_path: str,
    chunks: List[str],
    original_filename: str | None = None,
    page_numbers: List[int | None] | None = None,
    chunk_file_paths: List[str | None] | None = None,
    document_id: str | None = None,
    strategy_type: str = "token",
    chunk_metadata: List[Dict[str, Any]] | None = None,
) -> List[Dict[str, Any]]:
    """Build one metadata dictionary per text chunk.

    Keys written for every chunk:
        - filename: ``original_filename`` when truthy, otherwise the base name
          of ``file_path``.
        - upload_date: ISO-8601 string from a single ``datetime.now()`` call
          (naive local time), shared by all chunks of the call.
        - content_summary: the first 200 characters of the chunk, or ``""``
          when the chunk is not a string.
        - chunk_index: zero-based position of the chunk in ``chunks``.
        - document_id: the ``document_id`` argument (``None`` when not given).
        - strategy_type: the ``strategy_type`` argument.

    Keys written only when a non-None value is available for the chunk
    (the key is omitted otherwise, not set to None):
        - page_number: taken from ``page_numbers``.
        - chunk_file_path: taken from ``chunk_file_paths``.

    When ``chunk_metadata`` is non-empty, the dict at the chunk's position is
    merged into the entry last, so its keys override any of the keys above.
    Per the original documentation these are the Q&A strategy fields (for
    example question_id, question_index, section_type, question_text,
    section_heading, answer_contains_table, source_page_range, parent_topic)
    that are forwarded to the vector store.

    Args:
        file_path: Path of the file the chunks came from; must exist unless
            ``chunks`` is empty.
        chunks: Text chunks to describe.
        original_filename: Name to store instead of the base name of
            ``file_path`` (useful when ``file_path`` is a temporary file).
        page_numbers: Optional per-chunk page numbers; entries may be None.
        chunk_file_paths: Optional per-chunk source file paths; entries may
            be None.
        document_id: Optional identifier copied into every entry.
        strategy_type: Name of the chunking strategy, copied into every entry.
        chunk_metadata: Optional per-chunk dicts merged into the entries. An
            empty list is treated the same as None.

    Returns:
        A list with one metadata dict per chunk, in chunk order. An empty
        list when ``chunks`` is empty (no other validation runs in that case).

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        ValueError: If ``page_numbers`` or ``chunk_file_paths`` is given with
            a length different from ``chunks``, or if ``chunk_metadata`` is
            non-empty with a length different from ``chunks``.
    """
    # Nothing to describe: return before touching the filesystem.
    if not chunks:
        return []

    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    total = len(chunks)

    # Both per-chunk lists follow the same rule: when supplied, the length
    # must equal the number of chunks (an empty list counts as a mismatch).
    for label, values in (
        ("page_numbers", page_numbers),
        ("chunk_file_paths", chunk_file_paths),
    ):
        if values is not None and len(values) != total:
            raise ValueError(
                f"{label} length ({len(values)}) does not match chunks length ({total})"
            )

    # chunk_metadata is more lenient: an empty list is accepted and ignored.
    if chunk_metadata and len(chunk_metadata) != total:
        raise ValueError(
            f"chunk_metadata length ({len(chunk_metadata)}) does not match chunks length ({total})"
        )

    stored_name = original_filename or os.path.basename(file_path)
    # One timestamp for the whole batch so every chunk carries the same date.
    stamp = datetime.now().isoformat()

    results: List[Dict[str, Any]] = []
    for position, piece in enumerate(chunks):
        record: Dict[str, Any] = {
            "filename": stored_name,
            "upload_date": stamp,
            "content_summary": piece[:200] if isinstance(piece, str) else "",
            "chunk_index": position,
            "document_id": document_id,
            "strategy_type": strategy_type,
        }

        # Optional keys are left out entirely when no value is available.
        if page_numbers and page_numbers[position] is not None:
            record["page_number"] = page_numbers[position]
        if chunk_file_paths and chunk_file_paths[position] is not None:
            record["chunk_file_path"] = chunk_file_paths[position]

        # Merged last: strategy-provided fields take precedence over the base.
        if chunk_metadata:
            record.update(chunk_metadata[position])

        results.append(record)

    return results