feat(backend): add page_number, chunk_file_path, document_id to chunk metadata

Enhance extract_metadata() with three new optional fields for page-aware chunking support. Validates list length mismatches. Fully backward compatible — existing callers unaffected. Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent) Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
2026-04-24 10:30:40 +08:00 · 2026-04-24 10:30:40 +08:00 · b97264c66a
parent 0995c685fa
commit b97264c66a
2 changed files with 230 additions and 21 deletions
--- a/backend/app/test/test_phase1_enhanced_metadata.py
+++ b/backend/app/test/test_phase1_enhanced_metadata.py
@ -0,0 +1,188 @@
+"""Phase 1.5.4: Enhanced metadata tests.
+
+Tests for extract_metadata() with new page_number, chunk_file_path,
+and document_id fields. These fields are optional and maintain backward
+compatibility with existing callers.
+"""
+from pathlib import Path
+from datetime import datetime
+
+import pytest
+import importlib.util
+
+
+# Dynamically load the metadata module (same pattern as test_phase1_metadata.py).
+MODULE_PATH = Path(__file__).resolve().parents[1] / "utils" / "metadata.py"
+spec = importlib.util.spec_from_file_location("metadata_module", str(MODULE_PATH))
+assert spec is not None and spec.loader is not None
+metadata_module = importlib.util.module_from_spec(spec)  # type: ignore
+spec.loader.exec_module(metadata_module)  # type: ignore
+extract_metadata = getattr(metadata_module, "extract_metadata")
+
+
+def _is_iso8601(s: str) -> bool:
+    try:
+        datetime.fromisoformat(s)
+        return True
+    except ValueError:
+        return False
+
+
+# ── Test cases ──────────────────────────────────────────────────────────
+
+
+def test_extract_metadata_with_page_numbers(tmp_path):
+    """Each chunk should receive the corresponding page_number."""
+    dummy_file = tmp_path / "doc.pdf"
+    dummy_file.write_text("pdf content")
+
+    chunks = ["chunk A", "chunk B", "chunk C"]
+    page_numbers = [1, 2, 3]
+
+    metadata = extract_metadata(
+        str(dummy_file), chunks, page_numbers=page_numbers
+    )
+
+    assert len(metadata) == 3
+    for i, m in enumerate(metadata):
+        assert m["page_number"] == page_numbers[i]
+        # Existing fields must still be present
+        assert "filename" in m
+        assert "chunk_index" in m
+
+
+def test_extract_metadata_with_chunk_file_paths(tmp_path):
+    """Each chunk should receive the corresponding chunk_file_path."""
+    dummy_file = tmp_path / "report.pdf"
+    dummy_file.write_text("report data")
+
+    chunks = ["page-1 text", "page-2 text"]
+    chunk_file_paths = ["report_page_1.pdf", "report_page_2.pdf"]
+
+    metadata = extract_metadata(
+        str(dummy_file), chunks, chunk_file_paths=chunk_file_paths
+    )
+
+    assert len(metadata) == 2
+    for i, m in enumerate(metadata):
+        assert m["chunk_file_path"] == chunk_file_paths[i]
+
+
+def test_extract_metadata_with_document_id(tmp_path):
+    """All metadata dicts should contain the same document_id."""
+    dummy_file = tmp_path / "memo.docx"
+    dummy_file.write_text("memo")
+
+    chunks = ["section A", "section B"]
+    document_id = "test-uuid-123"
+
+    metadata = extract_metadata(
+        str(dummy_file), chunks, document_id=document_id
+    )
+
+    assert len(metadata) == 2
+    for m in metadata:
+        assert m["document_id"] == "test-uuid-123"
+
+
+def test_extract_metadata_all_new_fields(tmp_path):
+    """Pass all three new params together and verify complete structure."""
+    dummy_file = tmp_path / "full.pdf"
+    dummy_file.write_text("full doc")
+
+    chunks = ["alpha", "beta", "gamma"]
+    page_numbers = [5, 6, 7]
+    chunk_file_paths = ["full_p5.txt", "full_p6.txt", "full_p7.txt"]
+    document_id = "uuid-all-fields"
+
+    metadata = extract_metadata(
+        str(dummy_file),
+        chunks,
+        page_numbers=page_numbers,
+        chunk_file_paths=chunk_file_paths,
+        document_id=document_id,
+    )
+
+    assert len(metadata) == 3
+
+    for i, m in enumerate(metadata):
+        # New fields
+        assert m["page_number"] == page_numbers[i]
+        assert m["chunk_file_path"] == chunk_file_paths[i]
+        assert m["document_id"] == document_id
+
+        # Old fields must still be present and correct
+        assert m["filename"] == "full.pdf"
+        assert m["chunk_index"] == i
+        assert _is_iso8601(m["upload_date"])
+        assert m["content_summary"] == chunks[i]
+
+
+def test_extract_metadata_without_new_fields(tmp_path):
+    """Calling without any new params must be backward-compatible.
+
+    The new fields (page_number, chunk_file_path, document_id) should not
+    appear in the metadata dicts when not explicitly requested.
+    """
+    dummy_file = tmp_path / "legacy.txt"
+    dummy_file.write_text("legacy")
+
+    chunks = ["old-style chunk"]
+
+    metadata = extract_metadata(str(dummy_file), chunks)
+
+    assert len(metadata) == 1
+    m = metadata[0]
+
+    # New fields should be absent (or None — implementation choice).
+    # Following the plan: these fields are optional additions.
+    assert m.get("page_number") is None
+    assert m.get("chunk_file_path") is None
+    assert m.get("document_id") is None
+
+    # Core fields intact
+    assert m["filename"] == "legacy.txt"
+    assert m["chunk_index"] == 0
+
+
+def test_extract_metadata_page_numbers_length_mismatch(tmp_path):
+    """If page_numbers length != chunks length, should raise ValueError."""
+    dummy_file = tmp_path / "mismatch.pdf"
+    dummy_file.write_text("data")
+
+    chunks = ["a", "b", "c"]
+    page_numbers = [1, 2]  # Only 2 for 3 chunks
+
+    with pytest.raises(ValueError, match="page_numbers"):
+        extract_metadata(str(dummy_file), chunks, page_numbers=page_numbers)
+
+
+def test_extract_metadata_chunk_file_paths_length_mismatch(tmp_path):
+    """If chunk_file_paths length != chunks length, should raise ValueError."""
+    dummy_file = tmp_path / "mismatch2.pdf"
+    dummy_file.write_text("data")
+
+    chunks = ["x", "y"]
+    chunk_file_paths = ["only_one.pdf"]  # 1 for 2 chunks
+
+    with pytest.raises(ValueError, match="chunk_file_paths"):
+        extract_metadata(
+            str(dummy_file), chunks, chunk_file_paths=chunk_file_paths
+        )
+
+
+def test_extract_metadata_page_numbers_none_in_list(tmp_path):
+    """page_numbers can contain None for chunks without page info (e.g. DOCX)."""
+    dummy_file = tmp_path / "mixed.docx"
+    dummy_file.write_text("docx content")
+
+    chunks = ["cover page", "body text"]
+    page_numbers = [None, 1]
+
+    metadata = extract_metadata(
+        str(dummy_file), chunks, page_numbers=page_numbers
+    )
+
+    assert len(metadata) == 2
+    assert metadata[0]["page_number"] is None
+    assert metadata[1]["page_number"] == 1
--- a/backend/app/utils/metadata.py
+++ b/backend/app/utils/metadata.py
@ -2,54 +2,75 @@ from __future__ import annotations

 import os
 from datetime import datetime
-from typing import List, Dict, Any
+from typing import List, Dict, Any, Optional


-def extract_metadata(file_path: str, chunks: List[str], original_filename: str | None = None) -> List[Dict[str, Any]]:
+def extract_metadata(
+    file_path: str,
+    chunks: List[str],
+    original_filename: str | None = None,
+    page_numbers: List[int | None] | None = None,
+    chunk_file_paths: List[str | None] | None = None,
+    document_id: str | None = None,
+) -> List[Dict[str, Any]]:
    """Extract metadata for a list of text chunks.

-    For each chunk, create a metadata dictionary containing:
-    - filename: basename of the provided file_path (or original_filename if provided)
-    - upload_date: ISO 8601 timestamp of when metadata was generated
-    - content_summary: first 200 characters of the chunk (or full chunk if shorter)
-    - chunk_index: 0-based index of the chunk
+    Core fields (always present):
+    - filename, upload_date, content_summary, chunk_index
+
+    Optional fields (present when provided, None otherwise):
+    - page_number: page source for the chunk (int or None for non-paginated docs)
+    - chunk_file_path: path to the per-chunk source file
+    - document_id: unique identifier linking all chunks to the same document

    Args:
        file_path: Path to the file associated with the chunks.
        chunks: List of string chunks to generate metadata for.
        original_filename: Override filename stored in metadata when file_path
            is a temp file.
+        page_numbers: Optional per-chunk page numbers. Length must match chunks.
+        chunk_file_paths: Optional per-chunk source file paths. Length must match chunks.
+        document_id: Optional unique document identifier applied to all chunks.

    Returns:
-        A list of metadata dictionaries, one per chunk. If chunks is empty, returns an empty list.
+        A list of metadata dictionaries, one per chunk. Empty list if chunks is empty.

    Raises:
-        FileNotFoundError: If the provided file_path does not exist.
+        FileNotFoundError: If file_path does not exist.
+        ValueError: If page_numbers or chunk_file_paths length mismatches chunks.
    """
-
-    # Edge case: no chunks to metadataize
    if not chunks:
        return []

-    # Validate file existence up-front to follow the edge-case requirements
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

+    if page_numbers is not None and len(page_numbers) != len(chunks):
+        raise ValueError(
+            f"page_numbers length ({len(page_numbers)}) does not match chunks length ({len(chunks)})"
+        )
+
+    if chunk_file_paths is not None and len(chunk_file_paths) != len(chunks):
+        raise ValueError(
+            f"chunk_file_paths length ({len(chunk_file_paths)}) does not match chunks length ({len(chunks)})"
+        )
+
    filename = original_filename if original_filename else os.path.basename(file_path)
    upload_date = datetime.now().isoformat()

    metadata: List[Dict[str, Any]] = []
    for idx, chunk in enumerate(chunks):
-        # Ensure we always have a string for summary extraction
        text = chunk if isinstance(chunk, str) else ""
        content_summary = text[:200]
-        metadata.append(
-            {
+        entry: Dict[str, Any] = {
            "filename": filename,
            "upload_date": upload_date,
            "content_summary": content_summary,
            "chunk_index": idx,
+            "page_number": page_numbers[idx] if page_numbers else None,
+            "chunk_file_path": chunk_file_paths[idx] if chunk_file_paths else None,
+            "document_id": document_id,
        }
-        )
+        metadata.append(entry)

    return metadata