feat(backend): refactor ingest pipeline for page-aware chunking with PDF generation

PDF uploads now use parse_pdf_by_page() -> chunk_pages() -> extract page PDFs -> enhanced metadata with page_number, chunk_file_path, and document_id. Same-filename replacement deletes old chunks and PDFs before re-ingest. DOCX/TXT keep original flat flow with document_id added. RAGService.ingest_document() accepts optional document_id parameter. Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent) Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
2026-04-24 10:53:17 +08:00 · 2026-04-24 10:53:17 +08:00 · b2dd385443
parent 8c84062996
commit b2dd385443
4 changed files with 548 additions and 22 deletions
--- a/backend/app/routers/ingest.py
+++ b/backend/app/routers/ingest.py
@ -2,6 +2,7 @@
 import logging
 import os
 import tempfile
 import uuid
 from pathlib import Path
 from fastapi import APIRouter, UploadFile, File, HTTPException
@ -14,6 +15,27 @@ router = APIRouter(tags=["ingest"])
 SUPPORTED_EXTENSIONS = {".pdf", ".docx", ".txt"}
 def _delete_existing_document(rag, filename: str, chunk_dir: str) -> None:
    """Delete existing document with same filename from ChromaDB and chunk PDFs."""
    doc_list, _, _ = rag.list_documents()
    existing = [d for d in doc_list if d["filename"] == filename]
    if not existing:
        return
    for doc in existing:
        old_id = doc["document_id"]
        chunks_info = rag.list_chunks(old_id)
        for chunk in chunks_info:
            chunk_file = chunk.get("chunk_file_path")
            if chunk_file:
                full_path = os.path.join(chunk_dir, chunk_file)
                if os.path.exists(full_path):
                    os.unlink(full_path)
        rag.delete_document(old_id)
        logger.info("Deleted existing document %s (filename=%s)", old_id, filename)
@router.post("/ingest", response_model=IngestResponse)
 async def ingest_document(file: UploadFile = File(...)):
    """Ingest a document into the RAG system."""
@ -41,34 +63,99 @@ async def ingest_document(file: UploadFile = File(...)):
        logger.info("Ingesting file: %s (%d bytes)", filename, len(content))
        rag = RAGService(settings=settings)
        chunk_dir = settings.document_chunk_path
        _delete_existing_document(rag, filename, chunk_dir)
        document_id = str(uuid.uuid4())
        chunker = TokenChunkingStrategy(
            chunk_size=settings.chunk_size, overlap=settings.chunk_overlap
        )
        if file_ext == ".pdf":
-            from app.utils.pdf_parser import parse_pdf
+            from app.utils.pdf_parser import parse_pdf_by_page
-            text = parse_pdf(temp_path)
+
            pages = parse_pdf_by_page(temp_path)
            if not pages:
                raise HTTPException(
                    status_code=400,
                    detail="Document appears to be empty or could not be parsed",
                )
            chunked = chunker.chunk_pages(pages, overlap_tokens=settings.chunk_overlap)
            chunk_texts = [text for text, _ in chunked]
            page_numbers = [pn for _, pn in chunked]
            os.makedirs(chunk_dir, exist_ok=True)
            stem = Path(filename).stem
            chunk_file_paths: list[str | None] = []
            for page_num in page_numbers:
                from app.utils.pdf_extractor import extract_page_as_pdf
                chunk_filename = f"{stem}_page_{page_num}.pdf"
                output_path = os.path.join(chunk_dir, chunk_filename)
                try:
                    extract_page_as_pdf(temp_path, page_num, output_path)
                    chunk_file_paths.append(chunk_filename)
                except Exception as exc:
                    logger.warning(
                        "Failed to extract page %d PDF for %s: %s",
                        page_num, filename, exc,
                    )
                    chunk_file_paths.append(None)
            metadata = extract_metadata(
                temp_path,
                chunk_texts,
                original_filename=filename,
                page_numbers=page_numbers,
                chunk_file_paths=chunk_file_paths,
                document_id=document_id,
            )
            rag.ingest_document(temp_path, chunk_texts, metadata, document_id=document_id)
        elif file_ext == ".docx":
            from app.utils.docx_parser import parse_docx
            text = parse_docx(temp_path)
            chunks = chunker.chunk(text)
            if not chunks:
                raise HTTPException(
                    status_code=400,
                    detail="Document appears to be empty or could not be parsed",
                )
            metadata = extract_metadata(
                temp_path, chunks, original_filename=filename, document_id=document_id
            )
            rag.ingest_document(temp_path, chunks, metadata, document_id=document_id)
        elif file_ext == ".txt":
            with open(temp_path, "r", encoding="utf-8") as f:
                text = f.read()
        else:
            text = ""
-        chunker = TokenChunkingStrategy(chunk_size=settings.chunk_size, overlap=settings.chunk_overlap)
+            chunks = chunker.chunk(text)
        chunks = chunker.chunk(text)
-        if not chunks:
+            if not chunks:
-            raise HTTPException(status_code=400, detail="Document appears to be empty or could not be parsed")
+                raise HTTPException(
                    status_code=400,
                    detail="Document appears to be empty or could not be parsed",
                )
-        metadata = extract_metadata(temp_path, chunks, original_filename=filename)
+            metadata = extract_metadata(
                temp_path, chunks, original_filename=filename, document_id=document_id
            )
            rag.ingest_document(temp_path, chunks, metadata, document_id=document_id)
-        rag = RAGService(settings=settings)
+        logger.info("Ingested %s: doc_id=%s", filename, document_id)
        document_id = rag.ingest_document(temp_path, chunks, metadata)
        logger.info("Ingested %s: %d chunks, doc_id=%s", filename, len(chunks), document_id)
        chunk_count = len(chunk_texts) if file_ext == ".pdf" else len(chunks)
        return IngestResponse(
            document_id=document_id,
-            chunk_count=len(chunks),
+            chunk_count=chunk_count,
            filename=filename,
        )
--- a/backend/app/services/rag.py
+++ b/backend/app/services/rag.py
@ -42,11 +42,12 @@ class RAGService:
        file_path: str,
        chunks: List[str],
        metadata_list: List[Dict[str, Any]],
        document_id: Optional[str] = None,
    ) -> str:
        if not chunks:
            return ""
-        document_id = str(uuid.uuid4())
+        document_id = document_id or str(uuid.uuid4())
        ids = [f"{document_id}_{i}" for i in range(len(chunks))]
        self.collection.add(
--- a/backend/app/test/test_phase1_ingest.py
+++ b/backend/app/test/test_phase1_ingest.py
@ -27,14 +27,15 @@ class TestIngest:
        with patch("app.services.rag.RAGService") as mock_rag_class:
            mock_rag = MagicMock()
            mock_rag.ingest_document.return_value = "doc-123"
            mock_rag.list_documents.return_value = ([], 0, 0)
            mock_rag_class.return_value = mock_rag
-            with patch("app.utils.pdf_parser.parse_pdf") as mock_parse:
+            with patch("app.utils.pdf_parser.parse_pdf_by_page") as mock_parse:
-                mock_parse.return_value = "Parsed PDF text content"
+                mock_parse.return_value = [(1, "Page 1 text"), (2, "Page 2 text")]
                with patch("app.utils.chunking.TokenChunkingStrategy") as mock_chunk_class:
                    mock_chunker = MagicMock()
-                    mock_chunker.chunk.return_value = ["chunk 1", "chunk 2"]
+                    mock_chunker.chunk_pages.return_value = [("chunk 1", 1), ("chunk 2", 2)]
                    mock_chunk_class.return_value = mock_chunker
                    with patch("app.utils.metadata.extract_metadata") as mock_meta:
@ -43,10 +44,11 @@ class TestIngest:
                            {"filename": "test.pdf", "chunk_index": 1},
                        ]
-                        response = client.post(
+                        with patch("app.utils.pdf_extractor.extract_page_as_pdf"):
-                            "/api/v1/ingest",
+                            response = client.post(
-                            files={"file": ("test.pdf", io.BytesIO(b"%PDF-1.4"), "application/pdf")},
+                                "/api/v1/ingest",
-                        )
+                                files={"file": ("test.pdf", io.BytesIO(b"%PDF-1.4"), "application/pdf")},
                            )
        assert response.status_code == 200
        data = response.json()
@ -61,6 +63,7 @@ class TestIngest:
        with patch("app.services.rag.RAGService") as mock_rag_class:
            mock_rag = MagicMock()
            mock_rag.ingest_document.return_value = "doc-456"
            mock_rag.list_documents.return_value = ([], 0, 0)
            mock_rag_class.return_value = mock_rag
            with patch("app.utils.docx_parser.parse_docx") as mock_parse:
--- a/backend/app/test/test_phase1_ingest_page_aware.py
+++ b/backend/app/test/test_phase1_ingest_page_aware.py
@ -0,0 +1,435 @@
 """Phase 1.5.5c tests: Page-aware ingest router.
 Covers:
 1. PDF upload triggers page-aware pipeline (parse_pdf_by_page, chunk_pages, extract_page_as_pdf)
 2. DOCX upload uses old pipeline with document_id
 3. TXT upload uses old pipeline with document_id
 4. Same-filename replacement: existing document found → old chunks + PDFs deleted
 5. Same-filename replacement: no existing document → no deletion
 6. Empty PDF (no pages with text) → 400 error
 7. Page PDFs saved to correct directory with correct naming
 8. Metadata includes page_number and chunk_file_path for PDF uploads
 9. Metadata does NOT include page_number for DOCX uploads (None)
 """
 import io
 import os
 import uuid
 from pathlib import Path
 from unittest.mock import MagicMock, patch, call
 import pytest
 from fastapi.testclient import TestClient
 class TestPageAwareIngest:
    """Page-aware document ingestion tests."""
    @pytest.fixture
    def client(self):
        """Return a ``TestClient`` wrapping the application from ``app.main``."""
        # The application is imported when the fixture runs, not at module import.
        from app.main import app as application
        return TestClient(application)
    @pytest.fixture
    def mock_settings(self):
        """Return a ``MagicMock`` settings object with the chunking options set."""
        return MagicMock(
            chunk_size=1000,
            chunk_overlap=200,
            document_chunk_path="/tmp/test_document_chunk",
        )
    # ------------------------------------------------------------------ #
    # Test 1: PDF upload triggers page-aware pipeline
    # ------------------------------------------------------------------ #
    def test_pdf_upload_uses_page_aware_pipeline(self, client, mock_settings):
        """PDF should go through parse_pdf_by_page → chunk_pages → extract_page_as_pdf.

        Uploads a two-page PDF with every collaborator mocked and checks that
        the page-aware parser and chunker were used once, the flat ``chunk``
        method was not used, and one page PDF was extracted per chunk page.
        """
        doc_id = str(uuid.uuid4())
        with patch("app.services.rag.RAGService") as mock_rag_class, \
             patch("app.core.config.get_settings", return_value=mock_settings), \
             patch("app.utils.pdf_parser.parse_pdf_by_page") as mock_parse_by_page, \
             patch("app.utils.chunking.TokenChunkingStrategy") as mock_chunk_class, \
             patch("app.utils.metadata.extract_metadata") as mock_meta, \
             patch("app.utils.pdf_extractor.extract_page_as_pdf") as mock_extract_page:
            # RAGService instance returned by the patched class
            mock_rag = MagicMock()
            mock_rag.ingest_document.return_value = doc_id
            # No stored documents, so the replacement path stays idle
            mock_rag.list_documents.return_value = ([], 0, 0)
            mock_rag_class.return_value = mock_rag
            # parse_pdf_by_page returns 2 pages
            mock_parse_by_page.return_value = [
                (1, "Page 1 text content"),
                (2, "Page 2 text content"),
            ]
            # chunk_pages returns one chunk per page
            mock_chunker = MagicMock()
            mock_chunker.chunk_pages.return_value = [
                ("Page 1 text content", 1),
                ("Page 2 text content", 2),
            ]
            mock_chunk_class.return_value = mock_chunker
            # metadata
            mock_meta.return_value = [
                {"filename": "test.pdf", "chunk_index": 0, "page_number": 1},
                {"filename": "test.pdf", "chunk_index": 1, "page_number": 2},
            ]
            response = client.post(
                "/api/v1/ingest",
                files={"file": ("test.pdf", io.BytesIO(b"%PDF-1.4"), "application/pdf")},
            )
        assert response.status_code == 200
        data = response.json()
        assert data["chunk_count"] == 2
        assert data["filename"] == "test.pdf"
        # Verify page-aware parsing was called
        mock_parse_by_page.assert_called_once()
        # Verify chunk_pages was used (not chunk)
        mock_chunker.chunk_pages.assert_called_once()
        mock_chunker.chunk.assert_not_called()
        # Verify extract_page_as_pdf was called once per page, in page order
        # (the page number is the second positional argument)
        assert mock_extract_page.call_count == 2
        extracted_pages = [c[0][1] for c in mock_extract_page.call_args_list]
        assert extracted_pages == [1, 2]
    # ------------------------------------------------------------------ #
    # Test 2: DOCX upload uses old pipeline
    # ------------------------------------------------------------------ #
    def test_docx_upload_uses_old_pipeline(self, client, mock_settings):
        """DOCX should use parse_docx → chunk → metadata with document_id only.

        Checks that the flat ``chunk`` method is used (never ``chunk_pages``)
        and that ``extract_metadata`` receives a non-empty ``document_id``
        keyword argument.
        """
        doc_id = str(uuid.uuid4())
        with patch("app.services.rag.RAGService") as mock_rag_class, \
             patch("app.core.config.get_settings", return_value=mock_settings), \
             patch("app.utils.docx_parser.parse_docx") as mock_parse, \
             patch("app.utils.chunking.TokenChunkingStrategy") as mock_chunk_class, \
             patch("app.utils.metadata.extract_metadata") as mock_meta:
            mock_rag = MagicMock()
            mock_rag.ingest_document.return_value = doc_id
            mock_rag.list_documents.return_value = ([], 0, 0)
            mock_rag_class.return_value = mock_rag
            mock_parse.return_value = "DOCX text content"
            mock_chunker = MagicMock()
            mock_chunker.chunk.return_value = ["chunk 1"]
            mock_chunk_class.return_value = mock_chunker
            mock_meta.return_value = [{"filename": "test.docx", "chunk_index": 0}]
            response = client.post(
                "/api/v1/ingest",
                files={"file": ("test.docx", io.BytesIO(b"docx"), "application/vnd.openxmlformats-officedocument.wordprocessingml.document")},
            )
        assert response.status_code == 200
        data = response.json()
        assert data["chunk_count"] == 1
        assert data["filename"] == "test.docx"
        # Verify old pipeline: parse_docx → chunk (not chunk_pages)
        mock_parse.assert_called_once()
        mock_chunker.chunk.assert_called_once()
        mock_chunker.chunk_pages.assert_not_called()
        # Check the keyword value itself: matching on the call's repr would
        # also accept document_id=None.
        meta_kwargs = mock_meta.call_args[1]
        assert meta_kwargs.get("document_id"), (
            "extract_metadata was not given a document_id keyword"
        )
    # ------------------------------------------------------------------ #
    # Test 3: TXT upload uses old pipeline
    # ------------------------------------------------------------------ #
    def test_txt_upload_uses_old_pipeline(self, client, mock_settings):
        """A .txt upload is chunked with ``chunk`` and never with ``chunk_pages``."""
        rag_double = MagicMock()
        rag_double.ingest_document.return_value = str(uuid.uuid4())
        rag_double.list_documents.return_value = ([], 0, 0)

        flat_chunker = MagicMock()
        flat_chunker.chunk.return_value = ["txt chunk"]

        upload = {"file": ("notes.txt", io.BytesIO(b"Text content here"), "text/plain")}

        # Collaborators are wired through patch(..., return_value=...).
        with patch("app.services.rag.RAGService", return_value=rag_double), \
             patch("app.core.config.get_settings", return_value=mock_settings), \
             patch("app.utils.chunking.TokenChunkingStrategy", return_value=flat_chunker), \
             patch("app.utils.metadata.extract_metadata",
                   return_value=[{"filename": "notes.txt", "chunk_index": 0}]):
            response = client.post("/api/v1/ingest", files=upload)

        assert response.status_code == 200
        payload = response.json()
        assert payload["chunk_count"] == 1
        assert payload["filename"] == "notes.txt"
        flat_chunker.chunk.assert_called_once()
        flat_chunker.chunk_pages.assert_not_called()
    # ------------------------------------------------------------------ #
    # Test 4: Same-filename replacement: existing document → deletion
    # ------------------------------------------------------------------ #
    def test_same_filename_replacement_deletes_old(self, client, mock_settings, tmp_path):
        """Uploading file with same filename should delete old chunks and chunk PDFs.

        A stale chunk PDF is placed in a temporary chunk directory and the
        mocked RAG service reports an existing document named ``test.pdf``.
        After the upload the old document must have been deleted from the
        service and the stale PDF removed from disk.
        """
        doc_id = str(uuid.uuid4())
        old_doc_id = "old-doc-uuid-1234"
        chunk_dir = tmp_path / "document_chunk"
        chunk_dir.mkdir()
        old_pdf = chunk_dir / "test_page_3.pdf"
        old_pdf.write_text("old chunk pdf")
        mock_settings.document_chunk_path = str(chunk_dir)
        with patch("app.services.rag.RAGService") as mock_rag_class, \
             patch("app.core.config.get_settings", return_value=mock_settings), \
             patch("app.utils.pdf_parser.parse_pdf_by_page") as mock_parse_by_page, \
             patch("app.utils.chunking.TokenChunkingStrategy") as mock_chunk_class, \
             patch("app.utils.metadata.extract_metadata") as mock_meta, \
             patch("app.utils.pdf_extractor.extract_page_as_pdf"):
            mock_rag = MagicMock()
            mock_rag.ingest_document.return_value = doc_id
            # list_documents returns existing document with same filename
            mock_rag.list_documents.return_value = (
                [{"document_id": old_doc_id, "filename": "test.pdf", "chunk_count": 3}],
                1, 3
            )
            mock_rag_class.return_value = mock_rag
            # list_chunks returns chunks with file paths; only test_page_3.pdf
            # exists on disk, test_page_4.pdf exercises the missing-file case
            mock_rag.list_chunks.return_value = [
                {"chunk_id": f"{old_doc_id}_0", "chunk_file_path": "test_page_3.pdf"},
                {"chunk_id": f"{old_doc_id}_1", "chunk_file_path": "test_page_4.pdf"},
            ]
            mock_parse_by_page.return_value = [(1, "New page text")]
            mock_chunker = MagicMock()
            mock_chunker.chunk_pages.return_value = [("New page text", 1)]
            mock_chunk_class.return_value = mock_chunker
            mock_meta.return_value = [{"filename": "test.pdf", "chunk_index": 0}]
            response = client.post(
                "/api/v1/ingest",
                files={"file": ("test.pdf", io.BytesIO(b"%PDF-1.4"), "application/pdf")},
            )
        assert response.status_code == 200
        # Verify the old document's chunks were looked up and the doc deleted
        mock_rag.list_chunks.assert_called_once_with(old_doc_id)
        mock_rag.delete_document.assert_called_once_with(old_doc_id)
        # Verify the stale chunk PDF was removed from disk.
        # NOTE(review): relies on the get_settings patch reaching the router so
        # that chunk_dir is the tmp_path directory — confirm.
        assert not old_pdf.exists()
    # ------------------------------------------------------------------ #
    # Test 5: Same-filename replacement: no existing document
    # ------------------------------------------------------------------ #
    def test_no_existing_document_no_deletion(self, client, mock_settings):
        """A first upload of a filename must not call ``delete_document``."""
        rag_double = MagicMock()
        rag_double.ingest_document.return_value = str(uuid.uuid4())
        # Nothing stored yet, so there is nothing to replace.
        rag_double.list_documents.return_value = ([], 0, 0)

        page_chunker = MagicMock()
        page_chunker.chunk_pages.return_value = [("Page text", 1)]

        upload = {"file": ("newdoc.pdf", io.BytesIO(b"%PDF-1.4"), "application/pdf")}

        with patch("app.services.rag.RAGService", return_value=rag_double), \
             patch("app.core.config.get_settings", return_value=mock_settings), \
             patch("app.utils.pdf_parser.parse_pdf_by_page",
                   return_value=[(1, "Page text")]), \
             patch("app.utils.chunking.TokenChunkingStrategy", return_value=page_chunker), \
             patch("app.utils.metadata.extract_metadata",
                   return_value=[{"filename": "newdoc.pdf", "chunk_index": 0}]), \
             patch("app.utils.pdf_extractor.extract_page_as_pdf"):
            response = client.post("/api/v1/ingest", files=upload)

        assert response.status_code == 200
        rag_double.delete_document.assert_not_called()
    # ------------------------------------------------------------------ #
    # Test 6: Empty PDF → 400 error
    # ------------------------------------------------------------------ #
    def test_empty_pdf_returns_400(self, client, mock_settings):
        """A PDF for which the parser yields no pages is rejected with HTTP 400."""
        rag_double = MagicMock()
        rag_double.list_documents.return_value = ([], 0, 0)

        upload = {"file": ("empty.pdf", io.BytesIO(b"%PDF-1.4"), "application/pdf")}

        # The parser reports no pages at all.
        with patch("app.core.config.get_settings", return_value=mock_settings), \
             patch("app.utils.pdf_parser.parse_pdf_by_page", return_value=[]), \
             patch("app.services.rag.RAGService", return_value=rag_double):
            response = client.post("/api/v1/ingest", files=upload)

        assert response.status_code == 400
        detail = response.json()["detail"]
        assert "empty" in detail.lower()
    # ------------------------------------------------------------------ #
    # Test 7: Page PDFs saved with correct naming
    # ------------------------------------------------------------------ #
    def test_page_pdf_naming_convention(self, client, mock_settings, tmp_path):
        """Chunk PDFs should be named {stem}_page_{N}.pdf with relative paths in metadata.

        Uses a filename containing a space and a page gap (pages 1 and 3) and
        checks the page number and output filename given to
        ``extract_page_as_pdf`` as well as the ``chunk_file_paths`` handed to
        ``extract_metadata``.
        """
        doc_id = str(uuid.uuid4())
        chunk_dir = tmp_path / "document_chunk"
        chunk_dir.mkdir()
        mock_settings.document_chunk_path = str(chunk_dir)
        with patch("app.services.rag.RAGService") as mock_rag_class, \
             patch("app.core.config.get_settings", return_value=mock_settings), \
             patch("app.utils.pdf_parser.parse_pdf_by_page") as mock_parse_by_page, \
             patch("app.utils.chunking.TokenChunkingStrategy") as mock_chunk_class, \
             patch("app.utils.metadata.extract_metadata") as mock_meta, \
             patch("app.utils.pdf_extractor.extract_page_as_pdf") as mock_extract_page:
            mock_rag = MagicMock()
            mock_rag.ingest_document.return_value = doc_id
            mock_rag.list_documents.return_value = ([], 0, 0)
            mock_rag_class.return_value = mock_rag
            mock_parse_by_page.return_value = [
                (1, "Page 1"),
                (3, "Page 3"),  # page 2 was empty, skipped
            ]
            mock_chunker = MagicMock()
            mock_chunker.chunk_pages.return_value = [
                ("Page 1", 1),
                ("Page 3", 3),
            ]
            mock_chunk_class.return_value = mock_chunker
            mock_meta.return_value = [
                {"filename": "NEC4 ACC.pdf", "chunk_index": 0},
                {"filename": "NEC4 ACC.pdf", "chunk_index": 1},
            ]
            response = client.post(
                "/api/v1/ingest",
                files={"file": ("NEC4 ACC.pdf", io.BytesIO(b"%PDF-1.4"), "application/pdf")},
            )
        assert response.status_code == 200
        # Verify extract_page_as_pdf called with correct naming
        calls = mock_extract_page.call_args_list
        assert len(calls) == 2
        # Positional args are (source_path, page_number, output_path)
        # First call: page 1 → "NEC4 ACC_page_1.pdf"
        assert calls[0][0][1] == 1
        output_path_1 = calls[0][0][2]
        # Exact basename match; endswith() would also accept a prefixed name
        assert os.path.basename(output_path_1) == "NEC4 ACC_page_1.pdf"
        # Second call: page 3 → "NEC4 ACC_page_3.pdf"
        assert calls[1][0][1] == 3
        output_path_3 = calls[1][0][2]
        assert os.path.basename(output_path_3) == "NEC4 ACC_page_3.pdf"
        # Metadata receives the bare filenames (relative paths), in chunk order
        assert mock_meta.call_args[1]["chunk_file_paths"] == [
            "NEC4 ACC_page_1.pdf",
            "NEC4 ACC_page_3.pdf",
        ]
        # chunk_dir is pre-created above, so this only confirms the ingest
        # left the directory in place.
        assert os.path.isdir(str(chunk_dir))
    # ------------------------------------------------------------------ #
    # Test 8: Metadata includes page_number and chunk_file_path for PDFs
    # ------------------------------------------------------------------ #
    def test_pdf_metadata_includes_page_info(self, client, mock_settings, tmp_path):
        """``extract_metadata`` receives page numbers and chunk file names for PDFs."""
        pdf_chunk_dir = tmp_path / "document_chunk"
        pdf_chunk_dir.mkdir()
        mock_settings.document_chunk_path = str(pdf_chunk_dir)

        rag_double = MagicMock()
        rag_double.ingest_document.return_value = str(uuid.uuid4())
        rag_double.list_documents.return_value = ([], 0, 0)

        page_chunker = MagicMock()
        page_chunker.chunk_pages.return_value = [("Page 2 content", 2)]

        canned_metadata = [
            {
                "filename": "doc.pdf",
                "chunk_index": 0,
                "page_number": 2,
                "chunk_file_path": "doc_page_2.pdf",
            },
        ]
        upload = {"file": ("doc.pdf", io.BytesIO(b"%PDF-1.4"), "application/pdf")}

        with patch("app.services.rag.RAGService", return_value=rag_double), \
             patch("app.core.config.get_settings", return_value=mock_settings), \
             patch("app.utils.pdf_parser.parse_pdf_by_page",
                   return_value=[(2, "Page 2 content")]), \
             patch("app.utils.chunking.TokenChunkingStrategy", return_value=page_chunker), \
             patch("app.utils.metadata.extract_metadata",
                   return_value=canned_metadata) as metadata_mock, \
             patch("app.utils.pdf_extractor.extract_page_as_pdf"):
            response = client.post("/api/v1/ingest", files=upload)

        assert response.status_code == 200
        # Inspect the keyword arguments of the extract_metadata call.
        _, passed_kwargs = metadata_mock.call_args
        assert "page_numbers" in passed_kwargs
        assert passed_kwargs["page_numbers"] == [2]
        assert "chunk_file_paths" in passed_kwargs
        assert passed_kwargs["chunk_file_paths"] == ["doc_page_2.pdf"]
    # ------------------------------------------------------------------ #
    # Test 9: Metadata does NOT include page_number for DOCX (None)
    # ------------------------------------------------------------------ #
    def test_docx_metadata_no_page_info(self, client, mock_settings):
        """For DOCX, ``extract_metadata`` gets no page numbers or chunk file paths."""
        rag_double = MagicMock()
        rag_double.ingest_document.return_value = str(uuid.uuid4())
        rag_double.list_documents.return_value = ([], 0, 0)

        flat_chunker = MagicMock()
        flat_chunker.chunk.return_value = ["chunk 1"]

        docx_mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
        upload = {"file": ("test.docx", io.BytesIO(b"docx"), docx_mime)}

        with patch("app.services.rag.RAGService", return_value=rag_double), \
             patch("app.core.config.get_settings", return_value=mock_settings), \
             patch("app.utils.docx_parser.parse_docx", return_value="DOCX content"), \
             patch("app.utils.chunking.TokenChunkingStrategy", return_value=flat_chunker), \
             patch("app.utils.metadata.extract_metadata",
                   return_value=[{"filename": "test.docx", "chunk_index": 0}]) as metadata_mock:
            response = client.post("/api/v1/ingest", files=upload)

        assert response.status_code == 200
        # Neither page-related keyword may carry a value on the DOCX path.
        _, passed_kwargs = metadata_mock.call_args
        assert passed_kwargs.get("page_numbers") is None
        assert passed_kwargs.get("chunk_file_paths") is None