# legco_ai_assistant/backend/app/utils/metadata.py

from __future__ import annotations

import os
from datetime import datetime
from typing import List, Dict, Any, Optional


def extract_metadata(
    file_path: str,
    chunks: List[str],
    original_filename: str | None = None,
    page_numbers: List[int | None] | None = None,
    chunk_file_paths: List[str | None] | None = None,
    document_id: str | None = None,
    strategy_type: str = "token",
    chunk_metadata: List[Dict[str, Any]] | None = None,
) -> List[Dict[str, Any]]:
    """Build one metadata dictionary per text chunk.

    Keys written for every chunk, in this order:
    - filename: ``original_filename`` when truthy, else the basename of
      ``file_path``.
    - upload_date: ``datetime.now().isoformat()``, computed once and shared
      by all chunks of the call.
    - content_summary: first 200 characters of the chunk (empty string when
      the chunk is not a ``str``).
    - chunk_index: zero-based position of the chunk in ``chunks``.
    - document_id: the ``document_id`` argument as given (may be None).
    - strategy_type: the ``strategy_type`` argument as given.

    Keys written only when a non-None value exists for the chunk:
    - page_number: taken from ``page_numbers``.
    - chunk_file_path: taken from ``chunk_file_paths``.

    Extra keys from ``chunk_metadata`` (e.g. the Q&A strategy fields
    section_type, question_index, question_id, question_text,
    section_heading, answer_contains_table, source_page_range, parent_topic)
    are merged in last, so they override any key written above, including
    strategy_type.

    Args:
        file_path: Path to the file associated with the chunks. Must exist
            unless ``chunks`` is empty.
        chunks: Text chunks to generate metadata for.
        original_filename: Filename to store instead of the basename of
            ``file_path`` (useful when ``file_path`` is a temp file).
        page_numbers: Optional per-chunk page numbers. Length must match
            ``chunks``.
        chunk_file_paths: Optional per-chunk source file paths. Length must
            match ``chunks``.
        document_id: Optional document identifier applied to all chunks.
        strategy_type: Chunking strategy label stored on every chunk
            ("token" or "question").
        chunk_metadata: Optional per-chunk dicts merged into the matching
            entry. Length must match ``chunks``; an empty list is treated
            the same as None.

    Returns:
        A list of metadata dictionaries, one per chunk. Empty list if
        ``chunks`` is empty (``file_path`` is not checked in that case).

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        ValueError: If ``page_numbers``, ``chunk_file_paths`` or a non-empty
            ``chunk_metadata`` has a different length than ``chunks``.
    """
    # Nothing to describe: bail out before touching the filesystem.
    if not chunks:
        return []

    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    # Every per-chunk sequence has to line up one-to-one with chunks.
    if page_numbers is not None:
        if len(page_numbers) != len(chunks):
            raise ValueError(
                f"page_numbers length ({len(page_numbers)}) does not match chunks length ({len(chunks)})"
            )

    if chunk_file_paths is not None:
        if len(chunk_file_paths) != len(chunks):
            raise ValueError(
                f"chunk_file_paths length ({len(chunk_file_paths)}) does not match chunks length ({len(chunks)})"
            )

    # An empty chunk_metadata list is tolerated and later ignored.
    if not (chunk_metadata is None or len(chunk_metadata) in (0, len(chunks))):
        raise ValueError(
            f"chunk_metadata length ({len(chunk_metadata)}) does not match chunks length ({len(chunks)})"
        )

    stored_name = original_filename or os.path.basename(file_path)
    # One timestamp for the whole batch so all chunks share the same value.
    stamp = datetime.now().isoformat()

    records: List[Dict[str, Any]] = []
    for position, piece in enumerate(chunks):
        record: Dict[str, Any] = {
            "filename": stored_name,
            "upload_date": stamp,
            "content_summary": piece[:200] if isinstance(piece, str) else "",
            "chunk_index": position,
            "document_id": document_id,
            "strategy_type": strategy_type,
        }

        # Optional keys are left out entirely when there is no value.
        if page_numbers and page_numbers[position] is not None:
            record["page_number"] = page_numbers[position]
        if chunk_file_paths and chunk_file_paths[position] is not None:
            record["chunk_file_path"] = chunk_file_paths[position]

        # Merged last on purpose: strategy-supplied keys win over the base ones.
        if chunk_metadata:
            record.update(chunk_metadata[position])

        records.append(record)

    return records