# legco_ai_assistant/backend/app/utils/metadata.py

from __future__ import annotations

import os
from datetime import datetime
from typing import List, Dict, Any, Optional


def extract_metadata(
    file_path: str,
    chunks: List[str],
    original_filename: str | None = None,
    page_numbers: List[int | None] | None = None,
    chunk_file_paths: List[str | None] | None = None,
    document_id: str | None = None,
) -> List[Dict[str, Any]]:
    """Build one metadata dictionary per text chunk.

    Keys written to every entry, in this order:
    - ``filename``: ``original_filename`` when it is truthy, otherwise the
      base name of ``file_path``.
    - ``upload_date``: a single ``datetime.now().isoformat()`` string taken
      once per call and shared by all entries.
    - ``content_summary``: the first 200 characters of the chunk, or ``""``
      when the chunk is not a ``str``.
    - ``chunk_index``: zero-based position of the chunk in ``chunks``.
    - ``document_id``: the ``document_id`` argument as given (``None`` when
      it was not supplied).

    Keys written only when a non-``None`` value exists for that chunk:
    - ``page_number``: taken from ``page_numbers`` at the chunk's position.
    - ``chunk_file_path``: taken from ``chunk_file_paths`` at the chunk's
      position.

    Args:
        file_path: Path of the file the chunks came from; it must exist
            unless ``chunks`` is empty.
        chunks: Text chunks to describe.
        original_filename: Name to record instead of the base name of
            ``file_path``.
        page_numbers: Optional per-chunk page numbers; must have the same
            length as ``chunks``.
        chunk_file_paths: Optional per-chunk source file paths; must have
            the same length as ``chunks``.
        document_id: Optional identifier copied into every entry.

    Returns:
        A list with one dictionary per chunk, in chunk order. An empty
        ``chunks`` yields ``[]`` before any other check is performed.

    Raises:
        FileNotFoundError: If ``chunks`` is non-empty and ``file_path`` does
            not exist.
        ValueError: If ``page_numbers`` or ``chunk_file_paths`` is given with
            a length different from ``chunks``.
    """
    # Nothing to describe: return before touching the filesystem.
    if not chunks:
        return []

    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    if page_numbers is not None:
        if len(page_numbers) != len(chunks):
            raise ValueError(
                f"page_numbers length ({len(page_numbers)}) does not match chunks length ({len(chunks)})"
            )

    if chunk_file_paths is not None:
        if len(chunk_file_paths) != len(chunks):
            raise ValueError(
                f"chunk_file_paths length ({len(chunk_file_paths)}) does not match chunks length ({len(chunks)})"
            )

    display_name = original_filename or os.path.basename(file_path)
    stamped_at = datetime.now().isoformat()

    # Per-chunk extras, listed in the order their keys are added to an entry.
    per_chunk_extras = (
        ("page_number", page_numbers),
        ("chunk_file_path", chunk_file_paths),
    )

    records: List[Dict[str, Any]] = []
    for position, piece in enumerate(chunks):
        # Non-string chunks are tolerated and summarised as empty text.
        summary = piece[:200] if isinstance(piece, str) else ""
        record: Dict[str, Any] = {
            "filename": display_name,
            "upload_date": stamped_at,
            "content_summary": summary,
            "chunk_index": position,
            "document_id": document_id,
        }
        for key, values in per_chunk_extras:
            if not values:
                continue
            value = values[position]
            # A None slot means "no value for this chunk": leave the key out.
            if value is not None:
                record[key] = value
        records.append(record)

    return records