legco_ai_assistant/backend/app/test/test_phase1_enhanced_metada...

"""Phase 1.5.4: Enhanced metadata tests.

Tests for extract_metadata() with new page_number, chunk_file_path,
and document_id fields. These fields are optional and maintain backward
compatibility with existing callers.
"""
from pathlib import Path
from datetime import datetime

import pytest
import importlib.util


# Dynamically load the metadata module (same pattern as test_phase1_metadata.py).
# The module is loaded straight from its file path under the name
# "metadata_module" rather than through a regular package import.
# MODULE_PATH resolves to <grandparent of this file>/utils/metadata.py:
# parents[0] is this file's directory, parents[1] is the directory above it.
MODULE_PATH = Path(__file__).resolve().parents[1] / "utils" / "metadata.py"
spec = importlib.util.spec_from_file_location("metadata_module", str(MODULE_PATH))
# Fail at collection time with a clear assertion if no spec/loader could be
# created for the path, instead of hitting an AttributeError on None below.
assert spec is not None and spec.loader is not None
metadata_module = importlib.util.module_from_spec(spec)  # type: ignore
# Execute the module body so its top-level definitions become attributes.
spec.loader.exec_module(metadata_module)  # type: ignore
# Function under test; getattr raises AttributeError if the module lacks it.
extract_metadata = getattr(metadata_module, "extract_metadata")


def _is_iso8601(s: str) -> bool:
    """Return True when *s* parses as an ISO 8601 date/time string.

    Parsing is delegated to ``datetime.fromisoformat``, so the accepted
    formats are exactly those supported by that method on the running
    Python version.

    Non-string input (for example ``None`` coming from a missing metadata
    field) yields ``False`` instead of letting ``TypeError`` escape, so the
    helper can always be used directly inside an ``assert``.
    """
    try:
        datetime.fromisoformat(s)
    except (TypeError, ValueError):
        # ValueError: a string that is not valid ISO 8601.
        # TypeError: the argument is not a string at all.
        return False
    return True


# ── Test cases ──────────────────────────────────────────────────────────


def test_extract_metadata_with_page_numbers(tmp_path):
    """Verify that per-chunk page numbers are copied into the metadata.

    The i-th metadata dict must carry the i-th entry of ``page_numbers``
    and still expose the existing ``filename`` and ``chunk_index`` keys.
    """
    source = tmp_path / "doc.pdf"
    source.write_text("pdf content")

    expected_pages = [1, 2, 3]
    result = extract_metadata(
        str(source),
        ["chunk A", "chunk B", "chunk C"],
        page_numbers=expected_pages,
    )

    assert len(result) == 3
    # Lengths are known to match here, so zip pairs every entry with its page.
    for entry, page in zip(result, expected_pages):
        assert entry["page_number"] == page
        # The existing keys must survive the addition of the new field.
        assert "filename" in entry
        assert "chunk_index" in entry


def test_extract_metadata_with_chunk_file_paths(tmp_path):
    """Verify that per-chunk file paths are copied into the metadata.

    The i-th metadata dict must carry the i-th entry of
    ``chunk_file_paths`` under the ``chunk_file_path`` key.
    """
    source = tmp_path / "report.pdf"
    source.write_text("report data")

    expected_paths = ["report_page_1.pdf", "report_page_2.pdf"]
    result = extract_metadata(
        str(source),
        ["page-1 text", "page-2 text"],
        chunk_file_paths=expected_paths,
    )

    assert len(result) == 2
    # Compare the whole column at once; order must match the input order.
    assert [entry["chunk_file_path"] for entry in result] == expected_paths


def test_extract_metadata_with_document_id(tmp_path):
    """Verify that one document_id is stamped onto every chunk's metadata."""
    source = tmp_path / "memo.docx"
    source.write_text("memo")

    result = extract_metadata(
        str(source),
        ["section A", "section B"],
        document_id="test-uuid-123",
    )

    assert len(result) == 2
    # Every chunk of the same document shares the identical id.
    assert [entry["document_id"] for entry in result] == ["test-uuid-123"] * 2


def test_extract_metadata_all_new_fields(tmp_path):
    """Exercise page_numbers, chunk_file_paths and document_id in one call.

    Every returned dict must carry the three new values alongside the
    existing ``filename``, ``chunk_index``, ``upload_date`` and
    ``content_summary`` keys.
    """
    source = tmp_path / "full.pdf"
    source.write_text("full doc")

    texts = ["alpha", "beta", "gamma"]
    pages = [5, 6, 7]
    paths = ["full_p5.txt", "full_p6.txt", "full_p7.txt"]
    doc_id = "uuid-all-fields"

    result = extract_metadata(
        str(source),
        texts,
        page_numbers=pages,
        chunk_file_paths=paths,
        document_id=doc_id,
    )

    assert len(result) == 3

    per_chunk = zip(result, texts, pages, paths)
    for position, (entry, text, page, path) in enumerate(per_chunk):
        # Fields introduced by the enhanced-metadata change.
        assert entry["page_number"] == page
        assert entry["chunk_file_path"] == path
        assert entry["document_id"] == doc_id

        # Fields that existed before must still be present and correct.
        assert entry["filename"] == "full.pdf"
        assert entry["chunk_index"] == position
        assert _is_iso8601(entry["upload_date"])
        assert entry["content_summary"] == text


def test_extract_metadata_without_new_fields(tmp_path):
    """Check that the original two-argument call keeps working.

    When none of the new keyword arguments are supplied, ``page_number``,
    ``chunk_file_path`` and ``document_id`` must each be either missing
    from the metadata dict or set to None (the choice is left to the
    implementation), and the core fields must be unaffected.
    """
    source = tmp_path / "legacy.txt"
    source.write_text("legacy")

    result = extract_metadata(str(source), ["old-style chunk"])

    assert len(result) == 1
    entry = result[0]

    # dict.get returns None for a missing key as well as for an explicit
    # None value, so both implementation choices are accepted here.
    for optional_key in ("page_number", "chunk_file_path", "document_id"):
        assert entry.get(optional_key) is None

    # Core fields intact.
    assert entry["filename"] == "legacy.txt"
    assert entry["chunk_index"] == 0


def test_extract_metadata_page_numbers_length_mismatch(tmp_path):
    """Expect a ValueError mentioning page_numbers when lengths disagree."""
    source = tmp_path / "mismatch.pdf"
    source.write_text("data")

    three_chunks = ["a", "b", "c"]
    two_pages = [1, 2]  # deliberately one entry short

    # ``match`` is searched in the exception text, so the message must
    # name the offending argument.
    with pytest.raises(ValueError, match="page_numbers"):
        extract_metadata(str(source), three_chunks, page_numbers=two_pages)


def test_extract_metadata_chunk_file_paths_length_mismatch(tmp_path):
    """Expect a ValueError mentioning chunk_file_paths when lengths disagree."""
    source = tmp_path / "mismatch2.pdf"
    source.write_text("data")

    two_chunks = ["x", "y"]
    one_path = ["only_one.pdf"]  # deliberately one entry short

    # ``match`` is searched in the exception text, so the message must
    # name the offending argument.
    with pytest.raises(ValueError, match="chunk_file_paths"):
        extract_metadata(str(source), two_chunks, chunk_file_paths=one_path)


def test_extract_metadata_page_numbers_none_in_list(tmp_path):
    """Check handling of a None entry inside ``page_numbers``.

    A chunk whose page number is None (e.g. DOCX content without page
    info) must have no ``page_number`` key at all, while chunks with a
    real page number keep theirs.
    """
    source = tmp_path / "mixed.docx"
    source.write_text("docx content")

    result = extract_metadata(
        str(source),
        ["cover page", "body text"],
        page_numbers=[None, 1],
    )

    assert len(result) == 2
    without_page, with_page = result[0], result[1]
    assert "page_number" not in without_page
    assert with_page["page_number"] == 1