feat(backend): refactor ingest pipeline for page-aware chunking with PDF generation

PDF uploads now flow through parse_pdf_by_page() -> chunk_pages() -> per-page PDF extraction, and their chunk metadata is enhanced with page_number, chunk_file_path, and document_id.
Same-filename replacement deletes the old chunks and page PDFs before re-ingest.
DOCX/TXT keep the original flat flow, with document_id added.
RAGService.ingest_document() accepts an optional document_id parameter.

Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent)
Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
2026-04-24 10:53:17 +08:00 · 2026-04-24 10:53:17 +08:00 · b2dd385443
parent 8c84062996
commit b2dd385443
4 changed files with 548 additions and 22 deletions
--- a/backend/app/routers/ingest.py
+++ b/backend/app/routers/ingest.py
@ -2,6 +2,7 @@
 import logging
 import os
 import tempfile
+import uuid
 from pathlib import Path

 from fastapi import APIRouter, UploadFile, File, HTTPException
@ -14,6 +15,27 @@ router = APIRouter(tags=["ingest"])
 SUPPORTED_EXTENSIONS = {".pdf", ".docx", ".txt"}


+def _delete_existing_document(rag, filename: str, chunk_dir: str) -> None:
+    """Delete every stored document named ``filename`` and its chunk PDFs.
+
+    Chunk PDF names come from chunk metadata, relative to ``chunk_dir``.
+    """
+    doc_list, _, _ = rag.list_documents()
+    for doc in doc_list:
+        if doc["filename"] != filename:
+            continue
+        old_id = doc["document_id"]
+        for chunk in rag.list_chunks(old_id):
+            chunk_file = chunk.get("chunk_file_path")
+            if chunk_file:
+                try:  # EAFP: chunks of one page share a PDF, so it may already be gone
+                    os.unlink(os.path.join(chunk_dir, chunk_file))
+                except FileNotFoundError:
+                    pass
+        rag.delete_document(old_id)
+        logger.info("Deleted existing document %s (filename=%s)", old_id, filename)
+
+
@router.post("/ingest", response_model=IngestResponse)
 async def ingest_document(file: UploadFile = File(...)):
    """Ingest a document into the RAG system."""
@ -41,34 +63,99 @@ async def ingest_document(file: UploadFile = File(...)):

        logger.info("Ingesting file: %s (%d bytes)", filename, len(content))

+        rag = RAGService(settings=settings)
+        chunk_dir = settings.document_chunk_path
+        _delete_existing_document(rag, filename, chunk_dir)
+
+        document_id = str(uuid.uuid4())
+        chunker = TokenChunkingStrategy(
+            chunk_size=settings.chunk_size, overlap=settings.chunk_overlap
+        )
+
        if file_ext == ".pdf":
-            from app.utils.pdf_parser import parse_pdf
-            text = parse_pdf(temp_path)
+            from app.utils.pdf_parser import parse_pdf_by_page
+
+            pages = parse_pdf_by_page(temp_path)
+
+            if not pages:
+                raise HTTPException(
+                    status_code=400,
+                    detail="Document appears to be empty or could not be parsed",
+                )
+
+            chunked = chunker.chunk_pages(pages, overlap_tokens=settings.chunk_overlap)
+            chunk_texts = [text for text, _ in chunked]
+            page_numbers = [pn for _, pn in chunked]
+
+            os.makedirs(chunk_dir, exist_ok=True)
+            stem = Path(filename).stem
+            chunk_file_paths: list[str | None] = []
+            for page_num in page_numbers:
+                from app.utils.pdf_extractor import extract_page_as_pdf
+
+                chunk_filename = f"{stem}_page_{page_num}.pdf"
+                output_path = os.path.join(chunk_dir, chunk_filename)
+                try:
+                    extract_page_as_pdf(temp_path, page_num, output_path)
+                    chunk_file_paths.append(chunk_filename)
+                except Exception as exc:
+                    logger.warning(
+                        "Failed to extract page %d PDF for %s: %s",
+                        page_num, filename, exc,
+                    )
+                    chunk_file_paths.append(None)
+
+            metadata = extract_metadata(
+                temp_path,
+                chunk_texts,
+                original_filename=filename,
+                page_numbers=page_numbers,
+                chunk_file_paths=chunk_file_paths,
+                document_id=document_id,
+            )
+
+            rag.ingest_document(temp_path, chunk_texts, metadata, document_id=document_id)
+
        elif file_ext == ".docx":
            from app.utils.docx_parser import parse_docx
+
            text = parse_docx(temp_path)
+            chunks = chunker.chunk(text)
+
+            if not chunks:
+                raise HTTPException(
+                    status_code=400,
+                    detail="Document appears to be empty or could not be parsed",
+                )
+
+            metadata = extract_metadata(
+                temp_path, chunks, original_filename=filename, document_id=document_id
+            )
+            rag.ingest_document(temp_path, chunks, metadata, document_id=document_id)
+
        elif file_ext == ".txt":
            with open(temp_path, "r", encoding="utf-8") as f:
                text = f.read()
-        else:
-            text = ""

-        chunker = TokenChunkingStrategy(chunk_size=settings.chunk_size, overlap=settings.chunk_overlap)
-        chunks = chunker.chunk(text)
+            chunks = chunker.chunk(text)

-        if not chunks:
-            raise HTTPException(status_code=400, detail="Document appears to be empty or could not be parsed")
+            if not chunks:
+                raise HTTPException(
+                    status_code=400,
+                    detail="Document appears to be empty or could not be parsed",
+                )

-        metadata = extract_metadata(temp_path, chunks, original_filename=filename)
+            metadata = extract_metadata(
+                temp_path, chunks, original_filename=filename, document_id=document_id
+            )
+            rag.ingest_document(temp_path, chunks, metadata, document_id=document_id)

-        rag = RAGService(settings=settings)
-        document_id = rag.ingest_document(temp_path, chunks, metadata)
-
-        logger.info("Ingested %s: %d chunks, doc_id=%s", filename, len(chunks), document_id)
+        logger.info("Ingested %s: doc_id=%s", filename, document_id)

+        chunk_count = len(chunk_texts) if file_ext == ".pdf" else len(chunks)
        return IngestResponse(
            document_id=document_id,
-            chunk_count=len(chunks),
+            chunk_count=chunk_count,
            filename=filename,
        )

--- a/backend/app/services/rag.py
+++ b/backend/app/services/rag.py
@ -42,11 +42,12 @@ class RAGService:
        file_path: str,
        chunks: List[str],
        metadata_list: List[Dict[str, Any]],
+        document_id: Optional[str] = None,
    ) -> str:
        if not chunks:
            return ""

-        document_id = str(uuid.uuid4())
+        document_id = document_id or str(uuid.uuid4())
        ids = [f"{document_id}_{i}" for i in range(len(chunks))]

        self.collection.add(
--- a/backend/app/test/test_phase1_ingest.py
+++ b/backend/app/test/test_phase1_ingest.py
@ -27,14 +27,15 @@ class TestIngest:
        with patch("app.services.rag.RAGService") as mock_rag_class:
            mock_rag = MagicMock()
            mock_rag.ingest_document.return_value = "doc-123"
+            mock_rag.list_documents.return_value = ([], 0, 0)
            mock_rag_class.return_value = mock_rag

-            with patch("app.utils.pdf_parser.parse_pdf") as mock_parse:
-                mock_parse.return_value = "Parsed PDF text content"
+            with patch("app.utils.pdf_parser.parse_pdf_by_page") as mock_parse:
+                mock_parse.return_value = [(1, "Page 1 text"), (2, "Page 2 text")]

                with patch("app.utils.chunking.TokenChunkingStrategy") as mock_chunk_class:
                    mock_chunker = MagicMock()
-                    mock_chunker.chunk.return_value = ["chunk 1", "chunk 2"]
+                    mock_chunker.chunk_pages.return_value = [("chunk 1", 1), ("chunk 2", 2)]
                    mock_chunk_class.return_value = mock_chunker

                    with patch("app.utils.metadata.extract_metadata") as mock_meta:
@ -43,10 +44,11 @@ class TestIngest:
                            {"filename": "test.pdf", "chunk_index": 1},
                        ]

-                        response = client.post(
-                            "/api/v1/ingest",
-                            files={"file": ("test.pdf", io.BytesIO(b"%PDF-1.4"), "application/pdf")},
-                        )
+                        with patch("app.utils.pdf_extractor.extract_page_as_pdf"):
+                            response = client.post(
+                                "/api/v1/ingest",
+                                files={"file": ("test.pdf", io.BytesIO(b"%PDF-1.4"), "application/pdf")},
+                            )

        assert response.status_code == 200
        data = response.json()
@ -61,6 +63,7 @@ class TestIngest:
        with patch("app.services.rag.RAGService") as mock_rag_class:
            mock_rag = MagicMock()
            mock_rag.ingest_document.return_value = "doc-456"
+            mock_rag.list_documents.return_value = ([], 0, 0)
            mock_rag_class.return_value = mock_rag

            with patch("app.utils.docx_parser.parse_docx") as mock_parse:
--- a/backend/app/test/test_phase1_ingest_page_aware.py
+++ b/backend/app/test/test_phase1_ingest_page_aware.py
@ -0,0 +1,435 @@
+"""Phase 1.5.5c tests: Page-aware ingest router.
+
+Covers:
+1. PDF upload triggers page-aware pipeline (parse_pdf_by_page, chunk_pages, extract_page_as_pdf)
+2. DOCX upload uses old pipeline with document_id
+3. TXT upload uses old pipeline with document_id
+4. Same-filename replacement: existing document found → old chunks + PDFs deleted
+5. Same-filename replacement: no existing document → no deletion
+6. Empty PDF (no pages with text) → 400 error
+7. Page PDFs saved to correct directory with correct naming
+8. Metadata includes page_number and chunk_file_path for PDF uploads
+9. Metadata does NOT include page_number for DOCX uploads (None)
+"""
+import io
+import os
+import uuid
+from pathlib import Path
+from unittest.mock import MagicMock, patch, call
+
+import pytest
+from fastapi.testclient import TestClient
+
+
+class TestPageAwareIngest:
+    """Page-aware document ingestion tests."""
+
+    @pytest.fixture
+    def client(self):
+        """Create test client with mocked dependencies."""
+        from app.main import app
+        return TestClient(app)
+
+    @pytest.fixture
+    def mock_settings(self):
+        """Mock settings with document_chunk_path."""
+        settings = MagicMock()
+        settings.chunk_size = 1000
+        settings.chunk_overlap = 200
+        settings.document_chunk_path = "/tmp/test_document_chunk"
+        return settings
+
+    # ------------------------------------------------------------------ #
+    # Test 1: PDF upload triggers page-aware pipeline
+    # ------------------------------------------------------------------ #
+    def test_pdf_upload_uses_page_aware_pipeline(self, client, mock_settings):
+        """PDF should go through parse_pdf_by_page → chunk_pages → extract_page_as_pdf."""
+        doc_id = str(uuid.uuid4())
+
+        with patch("app.services.rag.RAGService") as mock_rag_class, \
+             patch("app.core.config.get_settings", return_value=mock_settings), \
+             patch("app.utils.pdf_parser.parse_pdf_by_page") as mock_parse_by_page, \
+             patch("app.utils.chunking.TokenChunkingStrategy") as mock_chunk_class, \
+             patch("app.utils.metadata.extract_metadata") as mock_meta, \
+             patch("app.utils.pdf_extractor.extract_page_as_pdf") as mock_extract_page:
+
+            # RAGService instance
+            mock_rag = MagicMock()
+            mock_rag.ingest_document.return_value = doc_id
+            mock_rag.list_documents.return_value = ([], 0, 0)
+            mock_rag_class.return_value = mock_rag
+
+            # parse_pdf_by_page returns 2 pages
+            mock_parse_by_page.return_value = [
+                (1, "Page 1 text content"),
+                (2, "Page 2 text content"),
+            ]
+
+            # chunk_pages returns one chunk per page
+            mock_chunker = MagicMock()
+            mock_chunker.chunk_pages.return_value = [
+                ("Page 1 text content", 1),
+                ("Page 2 text content", 2),
+            ]
+            mock_chunk_class.return_value = mock_chunker
+
+            # metadata
+            mock_meta.return_value = [
+                {"filename": "test.pdf", "chunk_index": 0, "page_number": 1},
+                {"filename": "test.pdf", "chunk_index": 1, "page_number": 2},
+            ]
+
+            response = client.post(
+                "/api/v1/ingest",
+                files={"file": ("test.pdf", io.BytesIO(b"%PDF-1.4"), "application/pdf")},
+            )
+
+        assert response.status_code == 200
+        data = response.json()
+        assert data["chunk_count"] == 2
+        assert data["filename"] == "test.pdf"
+
+        # Verify page-aware parsing was called
+        mock_parse_by_page.assert_called_once()
+
+        # Verify chunk_pages was used (not chunk)
+        mock_chunker.chunk_pages.assert_called_once()
+        mock_chunker.chunk.assert_not_called()
+
+        # Verify extract_page_as_pdf was called for each page
+        assert mock_extract_page.call_count == 2
+        # Verify both chunks were stored through a single ingest call
+        mock_rag.ingest_document.assert_called_once()
+
+    # ------------------------------------------------------------------ #
+    # Test 2: DOCX upload uses old pipeline
+    # ------------------------------------------------------------------ #
+    def test_docx_upload_uses_old_pipeline(self, client, mock_settings):
+        """DOCX should use parse_docx → chunk → metadata with document_id only."""
+        doc_id = str(uuid.uuid4())
+
+        with patch("app.services.rag.RAGService") as mock_rag_class, \
+             patch("app.core.config.get_settings", return_value=mock_settings), \
+             patch("app.utils.docx_parser.parse_docx") as mock_parse, \
+             patch("app.utils.chunking.TokenChunkingStrategy") as mock_chunk_class, \
+             patch("app.utils.metadata.extract_metadata") as mock_meta:
+
+            mock_rag = MagicMock()
+            mock_rag.ingest_document.return_value = doc_id
+            mock_rag.list_documents.return_value = ([], 0, 0)
+            mock_rag_class.return_value = mock_rag
+
+            mock_parse.return_value = "DOCX text content"
+
+            mock_chunker = MagicMock()
+            mock_chunker.chunk.return_value = ["chunk 1"]
+            mock_chunk_class.return_value = mock_chunker
+
+            mock_meta.return_value = [{"filename": "test.docx", "chunk_index": 0}]
+
+            response = client.post(
+                "/api/v1/ingest",
+                files={"file": ("test.docx", io.BytesIO(b"docx"), "application/vnd.openxmlformats-officedocument.wordprocessingml.document")},
+            )
+
+        assert response.status_code == 200
+        data = response.json()
+        assert data["chunk_count"] == 1
+        assert data["filename"] == "test.docx"
+
+        # Verify old pipeline: parse_docx → chunk (not chunk_pages)
+        mock_parse.assert_called_once()
+        mock_chunker.chunk.assert_called_once()
+        mock_chunker.chunk_pages.assert_not_called()
+
+        # Verify one non-None document_id reached both metadata and storage
+        # (strict check: a substring match on str(call) would also accept None)
+        meta_kwargs = mock_meta.call_args[1]
+        assert meta_kwargs.get("document_id") is not None
+        assert mock_rag.ingest_document.call_args[1]["document_id"] == meta_kwargs["document_id"]
+
+    # ------------------------------------------------------------------ #
+    # Test 3: TXT upload uses old pipeline
+    # ------------------------------------------------------------------ #
+    def test_txt_upload_uses_old_pipeline(self, client, mock_settings):
+        """TXT should read file → chunk → metadata with document_id."""
+        doc_id = str(uuid.uuid4())
+
+        with patch("app.services.rag.RAGService") as mock_rag_class, \
+             patch("app.core.config.get_settings", return_value=mock_settings), \
+             patch("app.utils.chunking.TokenChunkingStrategy") as mock_chunk_class, \
+             patch("app.utils.metadata.extract_metadata") as mock_meta:
+
+            mock_rag = MagicMock()
+            mock_rag.ingest_document.return_value = doc_id
+            mock_rag.list_documents.return_value = ([], 0, 0)
+            mock_rag_class.return_value = mock_rag
+
+            mock_chunker = MagicMock()
+            mock_chunker.chunk.return_value = ["txt chunk"]
+            mock_chunk_class.return_value = mock_chunker
+
+            mock_meta.return_value = [{"filename": "notes.txt", "chunk_index": 0}]
+
+            response = client.post(
+                "/api/v1/ingest",
+                files={"file": ("notes.txt", io.BytesIO(b"Text content here"), "text/plain")},
+            )
+
+        assert response.status_code == 200
+        data = response.json()
+        assert data["chunk_count"] == 1
+        assert data["filename"] == "notes.txt"
+        assert mock_meta.call_args[1].get("document_id") is not None  # TXT path forwards document_id
+        mock_chunker.chunk.assert_called_once()
+        mock_chunker.chunk_pages.assert_not_called()
+
+    # ------------------------------------------------------------------ #
+    # Test 4: Same-filename replacement: existing document → deletion
+    # ------------------------------------------------------------------ #
+    def test_same_filename_replacement_deletes_old(self, client, mock_settings, tmp_path):
+        """Uploading file with same filename should delete old chunks and chunk PDFs."""
+        doc_id = str(uuid.uuid4())
+        old_doc_id = "old-doc-uuid-1234"
+        chunk_dir = tmp_path / "document_chunk"
+        chunk_dir.mkdir()
+        old_pdf = chunk_dir / "test_page_3.pdf"
+        old_pdf.write_text("old chunk pdf")  # NOTE(review): removal of this file is never asserted below
+
+        mock_settings.document_chunk_path = str(chunk_dir)
+
+        with patch("app.services.rag.RAGService") as mock_rag_class, \
+             patch("app.core.config.get_settings", return_value=mock_settings), \
+             patch("app.utils.pdf_parser.parse_pdf_by_page") as mock_parse_by_page, \
+             patch("app.utils.chunking.TokenChunkingStrategy") as mock_chunk_class, \
+             patch("app.utils.metadata.extract_metadata") as mock_meta, \
+             patch("app.utils.pdf_extractor.extract_page_as_pdf"):
+
+            mock_rag = MagicMock()
+            mock_rag.ingest_document.return_value = doc_id
+            # list_documents returns existing document with same filename
+            mock_rag.list_documents.return_value = (
+                [{"document_id": old_doc_id, "filename": "test.pdf", "chunk_count": 3}],
+                1, 3
+            )
+            mock_rag_class.return_value = mock_rag
+
+            # list_chunks returns chunk with file path
+            mock_rag.list_chunks.return_value = [
+                {"chunk_id": f"{old_doc_id}_0", "chunk_file_path": "test_page_3.pdf"},
+                {"chunk_id": f"{old_doc_id}_1", "chunk_file_path": "test_page_4.pdf"},
+            ]
+
+            mock_parse_by_page.return_value = [(1, "New page text")]
+            mock_chunker = MagicMock()
+            mock_chunker.chunk_pages.return_value = [("New page text", 1)]
+            mock_chunk_class.return_value = mock_chunker
+            mock_meta.return_value = [{"filename": "test.pdf", "chunk_index": 0}]
+
+            response = client.post(
+                "/api/v1/ingest",
+                files={"file": ("test.pdf", io.BytesIO(b"%PDF-1.4"), "application/pdf")},
+            )
+
+        assert response.status_code == 200
+        # Verify old chunks were enumerated and the old document deleted
+        mock_rag.list_chunks.assert_called_once_with(old_doc_id)
+        mock_rag.delete_document.assert_called_once_with(old_doc_id)
+
+    # ------------------------------------------------------------------ #
+    # Test 5: Same-filename replacement: no existing document
+    # ------------------------------------------------------------------ #
+    def test_no_existing_document_no_deletion(self, client, mock_settings):
+        """Uploading new filename should NOT trigger any deletion."""
+        doc_id = str(uuid.uuid4())
+
+        with patch("app.services.rag.RAGService") as mock_rag_class, \
+             patch("app.core.config.get_settings", return_value=mock_settings), \
+             patch("app.utils.pdf_parser.parse_pdf_by_page") as mock_parse_by_page, \
+             patch("app.utils.chunking.TokenChunkingStrategy") as mock_chunk_class, \
+             patch("app.utils.metadata.extract_metadata") as mock_meta, \
+             patch("app.utils.pdf_extractor.extract_page_as_pdf"):
+
+            mock_rag = MagicMock()
+            mock_rag.ingest_document.return_value = doc_id
+            mock_rag.list_documents.return_value = ([], 0, 0)
+            mock_rag_class.return_value = mock_rag
+
+            mock_parse_by_page.return_value = [(1, "Page text")]
+            mock_chunker = MagicMock()
+            mock_chunker.chunk_pages.return_value = [("Page text", 1)]
+            mock_chunk_class.return_value = mock_chunker
+            mock_meta.return_value = [{"filename": "newdoc.pdf", "chunk_index": 0}]
+
+            response = client.post(
+                "/api/v1/ingest",
+                files={"file": ("newdoc.pdf", io.BytesIO(b"%PDF-1.4"), "application/pdf")},
+            )
+
+        assert response.status_code == 200
+
+        # Verify NO deletion happened
+        mock_rag.delete_document.assert_not_called()
+
+    # ------------------------------------------------------------------ #
+    # Test 6: Empty PDF → 400 error
+    # ------------------------------------------------------------------ #
+    def test_empty_pdf_returns_400(self, client, mock_settings):
+        """PDF with no extractable text should return 400."""
+        with patch("app.core.config.get_settings", return_value=mock_settings), \
+             patch("app.utils.pdf_parser.parse_pdf_by_page") as mock_parse_by_page, \
+             patch("app.services.rag.RAGService") as mock_rag_class:
+
+            mock_rag = MagicMock()
+            mock_rag.list_documents.return_value = ([], 0, 0)
+            mock_rag_class.return_value = mock_rag
+
+            # Empty PDF: no pages
+            mock_parse_by_page.return_value = []
+
+            response = client.post(
+                "/api/v1/ingest",
+                files={"file": ("empty.pdf", io.BytesIO(b"%PDF-1.4"), "application/pdf")},
+            )
+
+        assert response.status_code == 400
+        assert "empty" in response.json()["detail"].lower()
+
+    # ------------------------------------------------------------------ #
+    # Test 7: Page PDFs saved with correct naming
+    # ------------------------------------------------------------------ #
+    def test_page_pdf_naming_convention(self, client, mock_settings, tmp_path):
+        """Chunk PDFs should be named {stem}_page_{N}.pdf with relative paths in metadata."""
+        doc_id = str(uuid.uuid4())
+        chunk_dir = tmp_path / "document_chunk"
+        chunk_dir.mkdir()
+        mock_settings.document_chunk_path = str(chunk_dir)
+
+        with patch("app.services.rag.RAGService") as mock_rag_class, \
+             patch("app.core.config.get_settings", return_value=mock_settings), \
+             patch("app.utils.pdf_parser.parse_pdf_by_page") as mock_parse_by_page, \
+             patch("app.utils.chunking.TokenChunkingStrategy") as mock_chunk_class, \
+             patch("app.utils.metadata.extract_metadata") as mock_meta, \
+             patch("app.utils.pdf_extractor.extract_page_as_pdf") as mock_extract_page:
+
+            mock_rag = MagicMock()
+            mock_rag.ingest_document.return_value = doc_id
+            mock_rag.list_documents.return_value = ([], 0, 0)
+            mock_rag_class.return_value = mock_rag
+
+            mock_parse_by_page.return_value = [
+                (1, "Page 1"),
+                (3, "Page 3"),  # page 2 was empty, skipped
+            ]
+            mock_chunker = MagicMock()
+            mock_chunker.chunk_pages.return_value = [
+                ("Page 1", 1),
+                ("Page 3", 3),
+            ]
+            mock_chunk_class.return_value = mock_chunker
+            mock_meta.return_value = [
+                {"filename": "NEC4 ACC.pdf", "chunk_index": 0},
+                {"filename": "NEC4 ACC.pdf", "chunk_index": 1},
+            ]
+
+            response = client.post(
+                "/api/v1/ingest",
+                files={"file": ("NEC4 ACC.pdf", io.BytesIO(b"%PDF-1.4"), "application/pdf")},
+            )
+
+        assert response.status_code == 200
+
+        # Verify extract_page_as_pdf called with correct naming
+        calls = mock_extract_page.call_args_list
+        assert len(calls) == 2
+
+        # First call: page 1 → "NEC4 ACC_page_1.pdf"
+        output_path_1 = calls[0][0][2]  # third positional arg = output_path
+        assert output_path_1.endswith("NEC4 ACC_page_1.pdf")
+
+        # Second call: page 3 → "NEC4 ACC_page_3.pdf"
+        output_path_3 = calls[1][0][2]
+        assert output_path_3.endswith("NEC4 ACC_page_3.pdf")
+
+        # Verify the directory was created
+        assert os.path.isdir(str(chunk_dir))  # NOTE(review): vacuous, chunk_dir.mkdir() above already made it
+
+    # ------------------------------------------------------------------ #
+    # Test 8: Metadata includes page_number and chunk_file_path for PDFs
+    # ------------------------------------------------------------------ #
+    def test_pdf_metadata_includes_page_info(self, client, mock_settings, tmp_path):
+        """PDF metadata should include page_number and chunk_file_path."""
+        doc_id = str(uuid.uuid4())
+        chunk_dir = tmp_path / "document_chunk"
+        chunk_dir.mkdir()
+        mock_settings.document_chunk_path = str(chunk_dir)
+
+        with patch("app.services.rag.RAGService") as mock_rag_class, \
+             patch("app.core.config.get_settings", return_value=mock_settings), \
+             patch("app.utils.pdf_parser.parse_pdf_by_page") as mock_parse_by_page, \
+             patch("app.utils.chunking.TokenChunkingStrategy") as mock_chunk_class, \
+             patch("app.utils.metadata.extract_metadata") as mock_meta, \
+             patch("app.utils.pdf_extractor.extract_page_as_pdf"):
+
+            mock_rag = MagicMock()
+            mock_rag.ingest_document.return_value = doc_id
+            mock_rag.list_documents.return_value = ([], 0, 0)
+            mock_rag_class.return_value = mock_rag
+
+            mock_parse_by_page.return_value = [(2, "Page 2 content")]
+            mock_chunker = MagicMock()
+            mock_chunker.chunk_pages.return_value = [("Page 2 content", 2)]
+            mock_chunk_class.return_value = mock_chunker
+            mock_meta.return_value = [
+                {"filename": "doc.pdf", "chunk_index": 0, "page_number": 2, "chunk_file_path": "doc_page_2.pdf"},
+            ]
+
+            response = client.post(
+                "/api/v1/ingest",
+                files={"file": ("doc.pdf", io.BytesIO(b"%PDF-1.4"), "application/pdf")},
+            )
+
+        assert response.status_code == 200
+
+        # Verify extract_metadata was called with page_numbers and chunk_file_paths
+        meta_call_kwargs = mock_meta.call_args[1]
+        assert "page_numbers" in meta_call_kwargs
+        assert meta_call_kwargs["page_numbers"] == [2]
+        assert "chunk_file_paths" in meta_call_kwargs
+        assert meta_call_kwargs["chunk_file_paths"] == ["doc_page_2.pdf"]
+
+    # ------------------------------------------------------------------ #
+    # Test 9: Metadata does NOT include page_number for DOCX (None)
+    # ------------------------------------------------------------------ #
+    def test_docx_metadata_no_page_info(self, client, mock_settings):
+        """DOCX metadata should have page_number=None (no page_numbers passed)."""
+        doc_id = str(uuid.uuid4())
+
+        with patch("app.services.rag.RAGService") as mock_rag_class, \
+             patch("app.core.config.get_settings", return_value=mock_settings), \
+             patch("app.utils.docx_parser.parse_docx") as mock_parse, \
+             patch("app.utils.chunking.TokenChunkingStrategy") as mock_chunk_class, \
+             patch("app.utils.metadata.extract_metadata") as mock_meta:
+
+            mock_rag = MagicMock()
+            mock_rag.ingest_document.return_value = doc_id
+            mock_rag.list_documents.return_value = ([], 0, 0)
+            mock_rag_class.return_value = mock_rag
+
+            mock_parse.return_value = "DOCX content"
+            mock_chunker = MagicMock()
+            mock_chunker.chunk.return_value = ["chunk 1"]
+            mock_chunk_class.return_value = mock_chunker
+            mock_meta.return_value = [{"filename": "test.docx", "chunk_index": 0}]
+
+            response = client.post(
+                "/api/v1/ingest",
+                files={"file": ("test.docx", io.BytesIO(b"docx"), "application/vnd.openxmlformats-officedocument.wordprocessingml.document")},
+            )
+
+        assert response.status_code == 200
+
+        # Verify extract_metadata was called WITHOUT page_numbers
+        meta_call_kwargs = mock_meta.call_args[1]
+        assert meta_call_kwargs.get("page_numbers") is None
+        assert meta_call_kwargs.get("chunk_file_paths") is None