diff --git a/backend/app/routers/ingest.py b/backend/app/routers/ingest.py index deb4495..62f3957 100644 --- a/backend/app/routers/ingest.py +++ b/backend/app/routers/ingest.py @@ -2,6 +2,7 @@ import logging import os import tempfile +import uuid from pathlib import Path from fastapi import APIRouter, UploadFile, File, HTTPException @@ -14,6 +15,27 @@ router = APIRouter(tags=["ingest"]) SUPPORTED_EXTENSIONS = {".pdf", ".docx", ".txt"} +def _delete_existing_document(rag, filename: str, chunk_dir: str) -> None: + """Delete existing document with same filename from ChromaDB and chunk PDFs.""" + doc_list, _, _ = rag.list_documents() + + existing = [d for d in doc_list if d["filename"] == filename] + if not existing: + return + + for doc in existing: + old_id = doc["document_id"] + chunks_info = rag.list_chunks(old_id) + for chunk in chunks_info: + chunk_file = chunk.get("chunk_file_path") + if chunk_file: + full_path = os.path.join(chunk_dir, chunk_file) + if os.path.exists(full_path): + os.unlink(full_path) + rag.delete_document(old_id) + logger.info("Deleted existing document %s (filename=%s)", old_id, filename) + + @router.post("/ingest", response_model=IngestResponse) async def ingest_document(file: UploadFile = File(...)): """Ingest a document into the RAG system.""" @@ -41,34 +63,99 @@ async def ingest_document(file: UploadFile = File(...)): logger.info("Ingesting file: %s (%d bytes)", filename, len(content)) + rag = RAGService(settings=settings) + chunk_dir = settings.document_chunk_path + _delete_existing_document(rag, filename, chunk_dir) + + document_id = str(uuid.uuid4()) + chunker = TokenChunkingStrategy( + chunk_size=settings.chunk_size, overlap=settings.chunk_overlap + ) + if file_ext == ".pdf": - from app.utils.pdf_parser import parse_pdf - text = parse_pdf(temp_path) + from app.utils.pdf_parser import parse_pdf_by_page + + pages = parse_pdf_by_page(temp_path) + + if not pages: + raise HTTPException( + status_code=400, + detail="Document appears to be empty or could not be parsed", + ) + + chunked = chunker.chunk_pages(pages, overlap_tokens=settings.chunk_overlap) + chunk_texts = [text for text, _ in chunked] + page_numbers = [pn for _, pn in chunked] + + os.makedirs(chunk_dir, exist_ok=True) + stem = Path(filename).stem + chunk_file_paths: list[str | None] = [] + for page_num in page_numbers: + from app.utils.pdf_extractor import extract_page_as_pdf + + chunk_filename = f"{stem}_page_{page_num}.pdf" + output_path = os.path.join(chunk_dir, chunk_filename) + try: + extract_page_as_pdf(temp_path, page_num, output_path) + chunk_file_paths.append(chunk_filename) + except Exception as exc: + logger.warning( + "Failed to extract page %d PDF for %s: %s", + page_num, filename, exc, + ) + chunk_file_paths.append(None) + + metadata = extract_metadata( + temp_path, + chunk_texts, + original_filename=filename, + page_numbers=page_numbers, + chunk_file_paths=chunk_file_paths, + document_id=document_id, + ) + + rag.ingest_document(temp_path, chunk_texts, metadata, document_id=document_id) + elif file_ext == ".docx": from app.utils.docx_parser import parse_docx + text = parse_docx(temp_path) + chunks = chunker.chunk(text) + + if not chunks: + raise HTTPException( + status_code=400, + detail="Document appears to be empty or could not be parsed", + ) + + metadata = extract_metadata( + temp_path, chunks, original_filename=filename, document_id=document_id + ) + rag.ingest_document(temp_path, chunks, metadata, document_id=document_id) + elif file_ext == ".txt": with open(temp_path, "r", encoding="utf-8") as f: text = f.read() - else: - text = "" - chunker = TokenChunkingStrategy(chunk_size=settings.chunk_size, overlap=settings.chunk_overlap) - chunks = chunker.chunk(text) + chunks = chunker.chunk(text) - if not chunks: - raise HTTPException(status_code=400, detail="Document appears to be empty or could not be parsed") + if not chunks: + raise HTTPException( + status_code=400, + detail="Document appears to be empty or could not be parsed", + ) - metadata = extract_metadata(temp_path, chunks, original_filename=filename) + metadata = extract_metadata( + temp_path, chunks, original_filename=filename, document_id=document_id + ) + rag.ingest_document(temp_path, chunks, metadata, document_id=document_id) - rag = RAGService(settings=settings) - document_id = rag.ingest_document(temp_path, chunks, metadata) - - logger.info("Ingested %s: %d chunks, doc_id=%s", filename, len(chunks), document_id) + logger.info("Ingested %s: doc_id=%s", filename, document_id) + chunk_count = len(chunk_texts) if file_ext == ".pdf" else len(chunks) return IngestResponse( document_id=document_id, - chunk_count=len(chunks), + chunk_count=chunk_count, filename=filename, ) diff --git a/backend/app/services/rag.py b/backend/app/services/rag.py index 515bba2..bc3a017 100644 --- a/backend/app/services/rag.py +++ b/backend/app/services/rag.py @@ -42,11 +42,12 @@ class RAGService: file_path: str, chunks: List[str], metadata_list: List[Dict[str, Any]], + document_id: Optional[str] = None, ) -> str: if not chunks: return "" - document_id = str(uuid.uuid4()) + document_id = document_id or str(uuid.uuid4()) ids = [f"{document_id}_{i}" for i in range(len(chunks))] self.collection.add( diff --git a/backend/app/test/test_phase1_ingest.py b/backend/app/test/test_phase1_ingest.py index 9ab4cce..bce3ad7 100644 --- a/backend/app/test/test_phase1_ingest.py +++ b/backend/app/test/test_phase1_ingest.py @@ -27,14 +27,15 @@ class TestIngest: with patch("app.services.rag.RAGService") as mock_rag_class: mock_rag = MagicMock() mock_rag.ingest_document.return_value = "doc-123" + mock_rag.list_documents.return_value = ([], 0, 0) mock_rag_class.return_value = mock_rag - with patch("app.utils.pdf_parser.parse_pdf") as mock_parse: - mock_parse.return_value = "Parsed PDF text content" + with patch("app.utils.pdf_parser.parse_pdf_by_page") as mock_parse: + mock_parse.return_value = [(1, "Page 1 text"), (2, "Page 2 text")] with patch("app.utils.chunking.TokenChunkingStrategy") as mock_chunk_class: mock_chunker = MagicMock() - mock_chunker.chunk.return_value = ["chunk 1", "chunk 2"] + mock_chunker.chunk_pages.return_value = [("chunk 1", 1), ("chunk 2", 2)] mock_chunk_class.return_value = mock_chunker with patch("app.utils.metadata.extract_metadata") as mock_meta: @@ -43,10 +44,11 @@ class TestIngest: {"filename": "test.pdf", "chunk_index": 1}, ] - response = client.post( - "/api/v1/ingest", - files={"file": ("test.pdf", io.BytesIO(b"%PDF-1.4"), "application/pdf")}, - ) + with patch("app.utils.pdf_extractor.extract_page_as_pdf"): + response = client.post( + "/api/v1/ingest", + files={"file": ("test.pdf", io.BytesIO(b"%PDF-1.4"), "application/pdf")}, + ) assert response.status_code == 200 data = response.json() @@ -61,6 +63,7 @@ class TestIngest: with patch("app.services.rag.RAGService") as mock_rag_class: mock_rag = MagicMock() mock_rag.ingest_document.return_value = "doc-456" + mock_rag.list_documents.return_value = ([], 0, 0) mock_rag_class.return_value = mock_rag with patch("app.utils.docx_parser.parse_docx") as mock_parse: diff --git a/backend/app/test/test_phase1_ingest_page_aware.py b/backend/app/test/test_phase1_ingest_page_aware.py new file mode 100644 index 0000000..1a6e4f6 --- /dev/null +++ b/backend/app/test/test_phase1_ingest_page_aware.py @@ -0,0 +1,435 @@ +"""Phase 1.5.5c tests: Page-aware ingest router. + +Covers: +1. PDF upload triggers page-aware pipeline (parse_pdf_by_page, chunk_pages, extract_page_as_pdf) +2. DOCX upload uses old pipeline with document_id +3. TXT upload uses old pipeline with document_id +4. Same-filename replacement: existing document found → old chunks + PDFs deleted +5. Same-filename replacement: no existing document → no deletion +6. Empty PDF (no pages with text) → 400 error +7. Page PDFs saved to correct directory with correct naming +8. Metadata includes page_number and chunk_file_path for PDF uploads +9. Metadata does NOT include page_number for DOCX uploads (None) +""" +import io +import os +import uuid +from pathlib import Path +from unittest.mock import MagicMock, patch, call + +import pytest +from fastapi.testclient import TestClient + + +class TestPageAwareIngest: + """Page-aware document ingestion tests.""" + + @pytest.fixture + def client(self): + """Create test client with mocked dependencies.""" + from app.main import app + return TestClient(app) + + @pytest.fixture + def mock_settings(self): + """Mock settings with document_chunk_path.""" + settings = MagicMock() + settings.chunk_size = 1000 + settings.chunk_overlap = 200 + settings.document_chunk_path = "/tmp/test_document_chunk" + return settings + + # ------------------------------------------------------------------ # + # Test 1: PDF upload triggers page-aware pipeline + # ------------------------------------------------------------------ # + def test_pdf_upload_uses_page_aware_pipeline(self, client, mock_settings): + """PDF should go through parse_pdf_by_page → chunk_pages → extract_page_as_pdf.""" + doc_id = str(uuid.uuid4()) + + with patch("app.services.rag.RAGService") as mock_rag_class, \ + patch("app.core.config.get_settings", return_value=mock_settings), \ + patch("app.utils.pdf_parser.parse_pdf_by_page") as mock_parse_by_page, \ + patch("app.utils.chunking.TokenChunkingStrategy") as mock_chunk_class, \ + patch("app.utils.metadata.extract_metadata") as mock_meta, \ + patch("app.utils.pdf_extractor.extract_page_as_pdf") as mock_extract_page, \ + patch("app.services.rag.RAGService.list_documents") as mock_list_docs: + + # RAGService instance + mock_rag = MagicMock() + mock_rag.ingest_document.return_value = doc_id + mock_rag.list_documents.return_value = ([], 0, 0) + mock_rag_class.return_value = mock_rag + mock_rag_class.list_documents = MagicMock(return_value=([], 0, 0)) + + # parse_pdf_by_page returns 2 pages + mock_parse_by_page.return_value = [ + (1, "Page 1 text content"), + (2, "Page 2 text content"), + ] + + # chunk_pages returns one chunk per page + mock_chunker = MagicMock() + mock_chunker.chunk_pages.return_value = [ + ("Page 1 text content", 1), + ("Page 2 text content", 2), + ] + mock_chunk_class.return_value = mock_chunker + + # metadata + mock_meta.return_value = [ + {"filename": "test.pdf", "chunk_index": 0, "page_number": 1}, + {"filename": "test.pdf", "chunk_index": 1, "page_number": 2}, + ] + + response = client.post( + "/api/v1/ingest", + files={"file": ("test.pdf", io.BytesIO(b"%PDF-1.4"), "application/pdf")}, + ) + + assert response.status_code == 200 + data = response.json() + assert data["chunk_count"] == 2 + assert data["filename"] == "test.pdf" + + # Verify page-aware parsing was called + mock_parse_by_page.assert_called_once() + + # Verify chunk_pages was used (not chunk) + mock_chunker.chunk_pages.assert_called_once() + mock_chunker.chunk.assert_not_called() + + # Verify extract_page_as_pdf was called for each page + assert mock_extract_page.call_count == 2 + + # ------------------------------------------------------------------ # + # Test 2: DOCX upload uses old pipeline + # ------------------------------------------------------------------ # + def test_docx_upload_uses_old_pipeline(self, client, mock_settings): + """DOCX should use parse_docx → chunk → metadata with document_id only.""" + doc_id = str(uuid.uuid4()) + + with patch("app.services.rag.RAGService") as mock_rag_class, \ + patch("app.core.config.get_settings", return_value=mock_settings), \ + patch("app.utils.docx_parser.parse_docx") as mock_parse, \ + patch("app.utils.chunking.TokenChunkingStrategy") as mock_chunk_class, \ + patch("app.utils.metadata.extract_metadata") as mock_meta: + + mock_rag = MagicMock() + mock_rag.ingest_document.return_value = doc_id + mock_rag.list_documents.return_value = ([], 0, 0) + mock_rag_class.return_value = mock_rag + + mock_parse.return_value = "DOCX text content" + + mock_chunker = MagicMock() + mock_chunker.chunk.return_value = ["chunk 1"] + mock_chunk_class.return_value = mock_chunker + + mock_meta.return_value = [{"filename": "test.docx", "chunk_index": 0}] + + response = client.post( + "/api/v1/ingest", + files={"file": ("test.docx", io.BytesIO(b"docx"), "application/vnd.openxmlformats-officedocument.wordprocessingml.document")}, + ) + + assert response.status_code == 200 + data = response.json() + assert data["chunk_count"] == 1 + assert data["filename"] == "test.docx" + + # Verify old pipeline: parse_docx → chunk (not chunk_pages) + mock_parse.assert_called_once() + mock_chunker.chunk.assert_called_once() + mock_chunker.chunk_pages.assert_not_called() + + # Verify extract_metadata was called with document_id + meta_call = mock_meta.call_args + assert meta_call[1].get("document_id") is not None or \ + (len(meta_call[0]) > 3 and meta_call[0][3] is not None) or \ + "document_id" in str(meta_call) + + # ------------------------------------------------------------------ # + # Test 3: TXT upload uses old pipeline + # ------------------------------------------------------------------ # + def test_txt_upload_uses_old_pipeline(self, client, mock_settings): + """TXT should read file → chunk → metadata with document_id.""" + doc_id = str(uuid.uuid4()) + + with patch("app.services.rag.RAGService") as mock_rag_class, \ + patch("app.core.config.get_settings", return_value=mock_settings), \ + patch("app.utils.chunking.TokenChunkingStrategy") as mock_chunk_class, \ + patch("app.utils.metadata.extract_metadata") as mock_meta: + + mock_rag = MagicMock() + mock_rag.ingest_document.return_value = doc_id + mock_rag.list_documents.return_value = ([], 0, 0) + mock_rag_class.return_value = mock_rag + + mock_chunker = MagicMock() + mock_chunker.chunk.return_value = ["txt chunk"] + mock_chunk_class.return_value = mock_chunker + + mock_meta.return_value = [{"filename": "notes.txt", "chunk_index": 0}] + + response = client.post( + "/api/v1/ingest", + files={"file": ("notes.txt", io.BytesIO(b"Text content here"), "text/plain")}, + ) + + assert response.status_code == 200 + data = response.json() + assert data["chunk_count"] == 1 + assert data["filename"] == "notes.txt" + + mock_chunker.chunk.assert_called_once() + mock_chunker.chunk_pages.assert_not_called() + + # ------------------------------------------------------------------ # + # Test 4: Same-filename replacement: existing document → deletion + # ------------------------------------------------------------------ # + def test_same_filename_replacement_deletes_old(self, client, mock_settings, tmp_path): + """Uploading file with same filename should delete old chunks and chunk PDFs.""" + doc_id = str(uuid.uuid4()) + old_doc_id = "old-doc-uuid-1234" + chunk_dir = tmp_path / "document_chunk" + chunk_dir.mkdir() + old_pdf = chunk_dir / "test_page_3.pdf" + old_pdf.write_text("old chunk pdf") + + mock_settings.document_chunk_path = str(chunk_dir) + + with patch("app.services.rag.RAGService") as mock_rag_class, \ + patch("app.core.config.get_settings", return_value=mock_settings), \ + patch("app.utils.pdf_parser.parse_pdf_by_page") as mock_parse_by_page, \ + patch("app.utils.chunking.TokenChunkingStrategy") as mock_chunk_class, \ + patch("app.utils.metadata.extract_metadata") as mock_meta, \ + patch("app.utils.pdf_extractor.extract_page_as_pdf"): + + mock_rag = MagicMock() + mock_rag.ingest_document.return_value = doc_id + # list_documents returns existing document with same filename + mock_rag.list_documents.return_value = ( + [{"document_id": old_doc_id, "filename": "test.pdf", "chunk_count": 3}], + 1, 3 + ) + mock_rag_class.return_value = mock_rag + + # list_chunks returns chunk with file path + mock_rag.list_chunks.return_value = [ + {"chunk_id": f"{old_doc_id}_0", "chunk_file_path": "test_page_3.pdf"}, + {"chunk_id": f"{old_doc_id}_1", "chunk_file_path": "test_page_4.pdf"}, + ] + + mock_parse_by_page.return_value = [(1, "New page text")] + mock_chunker = MagicMock() + mock_chunker.chunk_pages.return_value = [("New page text", 1)] + mock_chunk_class.return_value = mock_chunker + mock_meta.return_value = [{"filename": "test.pdf", "chunk_index": 0}] + + response = client.post( + "/api/v1/ingest", + files={"file": ("test.pdf", io.BytesIO(b"%PDF-1.4"), "application/pdf")}, + ) + + assert response.status_code == 200 + + # Verify delete_document was called for old doc + mock_rag.delete_document.assert_called_once_with(old_doc_id) + + # ------------------------------------------------------------------ # + # Test 5: Same-filename replacement: no existing document + # ------------------------------------------------------------------ # + def test_no_existing_document_no_deletion(self, client, mock_settings): + """Uploading new filename should NOT trigger any deletion.""" + doc_id = str(uuid.uuid4()) + + with patch("app.services.rag.RAGService") as mock_rag_class, \ + patch("app.core.config.get_settings", return_value=mock_settings), \ + patch("app.utils.pdf_parser.parse_pdf_by_page") as mock_parse_by_page, \ + patch("app.utils.chunking.TokenChunkingStrategy") as mock_chunk_class, \ + patch("app.utils.metadata.extract_metadata") as mock_meta, \ + patch("app.utils.pdf_extractor.extract_page_as_pdf"): + + mock_rag = MagicMock() + mock_rag.ingest_document.return_value = doc_id + mock_rag.list_documents.return_value = ([], 0, 0) + mock_rag_class.return_value = mock_rag + + mock_parse_by_page.return_value = [(1, "Page text")] + mock_chunker = MagicMock() + mock_chunker.chunk_pages.return_value = [("Page text", 1)] + mock_chunk_class.return_value = mock_chunker + mock_meta.return_value = [{"filename": "newdoc.pdf", "chunk_index": 0}] + + response = client.post( + "/api/v1/ingest", + files={"file": ("newdoc.pdf", io.BytesIO(b"%PDF-1.4"), "application/pdf")}, + ) + + assert response.status_code == 200 + + # Verify NO deletion happened + mock_rag.delete_document.assert_not_called() + + # ------------------------------------------------------------------ # + # Test 6: Empty PDF → 400 error + # ------------------------------------------------------------------ # + def test_empty_pdf_returns_400(self, client, mock_settings): + """PDF with no extractable text should return 400.""" + with patch("app.core.config.get_settings", return_value=mock_settings), \ + patch("app.utils.pdf_parser.parse_pdf_by_page") as mock_parse_by_page, \ + patch("app.services.rag.RAGService") as mock_rag_class: + + mock_rag = MagicMock() + mock_rag.list_documents.return_value = ([], 0, 0) + mock_rag_class.return_value = mock_rag + + # Empty PDF: no pages + mock_parse_by_page.return_value = [] + + response = client.post( + "/api/v1/ingest", + files={"file": ("empty.pdf", io.BytesIO(b"%PDF-1.4"), "application/pdf")}, + ) + + assert response.status_code == 400 + assert "empty" in response.json()["detail"].lower() + + # ------------------------------------------------------------------ # + # Test 7: Page PDFs saved with correct naming + # ------------------------------------------------------------------ # + def test_page_pdf_naming_convention(self, client, mock_settings, tmp_path): + """Chunk PDFs should be named {stem}_page_{N}.pdf with relative paths in metadata.""" + doc_id = str(uuid.uuid4()) + chunk_dir = tmp_path / "document_chunk" + chunk_dir.mkdir() + mock_settings.document_chunk_path = str(chunk_dir) + + with patch("app.services.rag.RAGService") as mock_rag_class, \ + patch("app.core.config.get_settings", return_value=mock_settings), \ + patch("app.utils.pdf_parser.parse_pdf_by_page") as mock_parse_by_page, \ + patch("app.utils.chunking.TokenChunkingStrategy") as mock_chunk_class, \ + patch("app.utils.metadata.extract_metadata") as mock_meta, \ + patch("app.utils.pdf_extractor.extract_page_as_pdf") as mock_extract_page: + + mock_rag = MagicMock() + mock_rag.ingest_document.return_value = doc_id + mock_rag.list_documents.return_value = ([], 0, 0) + mock_rag_class.return_value = mock_rag + + mock_parse_by_page.return_value = [ + (1, "Page 1"), + (3, "Page 3"), # page 2 was empty, skipped + ] + mock_chunker = MagicMock() + mock_chunker.chunk_pages.return_value = [ + ("Page 1", 1), + ("Page 3", 3), + ] + mock_chunk_class.return_value = mock_chunker + mock_meta.return_value = [ + {"filename": "NEC4 ACC.pdf", "chunk_index": 0}, + {"filename": "NEC4 ACC.pdf", "chunk_index": 1}, + ] + + response = client.post( + "/api/v1/ingest", + files={"file": ("NEC4 ACC.pdf", io.BytesIO(b"%PDF-1.4"), "application/pdf")}, + ) + + assert response.status_code == 200 + + # Verify extract_page_as_pdf called with correct naming + calls = mock_extract_page.call_args_list + assert len(calls) == 2 + + # First call: page 1 → "NEC4 ACC_page_1.pdf" + output_path_1 = calls[0][0][2] # third positional arg = output_path + assert output_path_1.endswith("NEC4 ACC_page_1.pdf") + + # Second call: page 3 → "NEC4 ACC_page_3.pdf" + output_path_3 = calls[1][0][2] + assert output_path_3.endswith("NEC4 ACC_page_3.pdf") + + # Verify the directory was created + assert os.path.isdir(str(chunk_dir)) + + # ------------------------------------------------------------------ # + # Test 8: Metadata includes page_number and chunk_file_path for PDFs + # ------------------------------------------------------------------ # + def test_pdf_metadata_includes_page_info(self, client, mock_settings, tmp_path): + """PDF metadata should include page_number and chunk_file_path.""" + doc_id = str(uuid.uuid4()) + chunk_dir = tmp_path / "document_chunk" + chunk_dir.mkdir() + mock_settings.document_chunk_path = str(chunk_dir) + + with patch("app.services.rag.RAGService") as mock_rag_class, \ + patch("app.core.config.get_settings", return_value=mock_settings), \ + patch("app.utils.pdf_parser.parse_pdf_by_page") as mock_parse_by_page, \ + patch("app.utils.chunking.TokenChunkingStrategy") as mock_chunk_class, \ + patch("app.utils.metadata.extract_metadata") as mock_meta, \ + patch("app.utils.pdf_extractor.extract_page_as_pdf"): + + mock_rag = MagicMock() + mock_rag.ingest_document.return_value = doc_id + mock_rag.list_documents.return_value = ([], 0, 0) + mock_rag_class.return_value = mock_rag + + mock_parse_by_page.return_value = [(2, "Page 2 content")] + mock_chunker = MagicMock() + mock_chunker.chunk_pages.return_value = [("Page 2 content", 2)] + mock_chunk_class.return_value = mock_chunker + mock_meta.return_value = [ + {"filename": "doc.pdf", "chunk_index": 0, "page_number": 2, "chunk_file_path": "doc_page_2.pdf"}, + ] + + response = client.post( + "/api/v1/ingest", + files={"file": ("doc.pdf", io.BytesIO(b"%PDF-1.4"), "application/pdf")}, + ) + + assert response.status_code == 200 + + # Verify extract_metadata was called with page_numbers and chunk_file_paths + meta_call_kwargs = mock_meta.call_args[1] + assert "page_numbers" in meta_call_kwargs + assert meta_call_kwargs["page_numbers"] == [2] + assert "chunk_file_paths" in meta_call_kwargs + assert meta_call_kwargs["chunk_file_paths"] == ["doc_page_2.pdf"] + + # ------------------------------------------------------------------ # + # Test 9: Metadata does NOT include page_number for DOCX (None) + # ------------------------------------------------------------------ # + def test_docx_metadata_no_page_info(self, client, mock_settings): + """DOCX metadata should have page_number=None (no page_numbers passed).""" + doc_id = str(uuid.uuid4()) + + with patch("app.services.rag.RAGService") as mock_rag_class, \ + patch("app.core.config.get_settings", return_value=mock_settings), \ + patch("app.utils.docx_parser.parse_docx") as mock_parse, \ + patch("app.utils.chunking.TokenChunkingStrategy") as mock_chunk_class, \ + patch("app.utils.metadata.extract_metadata") as mock_meta: + + mock_rag = MagicMock() + mock_rag.ingest_document.return_value = doc_id + mock_rag.list_documents.return_value = ([], 0, 0) + mock_rag_class.return_value = mock_rag + + mock_parse.return_value = "DOCX content" + mock_chunker = MagicMock() + mock_chunker.chunk.return_value = ["chunk 1"] + mock_chunk_class.return_value = mock_chunker + mock_meta.return_value = [{"filename": "test.docx", "chunk_index": 0}] + + response = client.post( + "/api/v1/ingest", + files={"file": ("test.docx", io.BytesIO(b"docx"), "application/vnd.openxmlformats-officedocument.wordprocessingml.document")}, + ) + + assert response.status_code == 200 + + # Verify extract_metadata was called WITHOUT page_numbers + meta_call_kwargs = mock_meta.call_args[1] + assert meta_call_kwargs.get("page_numbers") is None + assert meta_call_kwargs.get("chunk_file_paths") is None