feat(backend): refactor ingest pipeline for page-aware chunking with PDF generation

PDF uploads now use parse_pdf_by_page() -> chunk_pages() -> extract page PDFs -> enhanced metadata with page_number, chunk_file_path, and document_id. Same-filename replacement deletes old chunks and PDFs before re-ingest. DOCX/TXT keep original flat flow with document_id added. RAGService.ingest_document() accepts optional document_id parameter.

Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent)

Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
This commit is contained in:
Woody 2026-04-24 10:53:17 +08:00
parent 8c84062996
commit b2dd385443
4 changed files with 548 additions and 22 deletions

View File

@ -2,6 +2,7 @@
import logging import logging
import os import os
import tempfile import tempfile
import uuid
from pathlib import Path from pathlib import Path
from fastapi import APIRouter, UploadFile, File, HTTPException from fastapi import APIRouter, UploadFile, File, HTTPException
@ -14,6 +15,27 @@ router = APIRouter(tags=["ingest"])
SUPPORTED_EXTENSIONS = {".pdf", ".docx", ".txt"} SUPPORTED_EXTENSIONS = {".pdf", ".docx", ".txt"}
def _delete_existing_document(rag, filename: str, chunk_dir: str) -> None:
"""Delete existing document with same filename from ChromaDB and chunk PDFs."""
doc_list, _, _ = rag.list_documents()
existing = [d for d in doc_list if d["filename"] == filename]
if not existing:
return
for doc in existing:
old_id = doc["document_id"]
chunks_info = rag.list_chunks(old_id)
for chunk in chunks_info:
chunk_file = chunk.get("chunk_file_path")
if chunk_file:
full_path = os.path.join(chunk_dir, chunk_file)
if os.path.exists(full_path):
os.unlink(full_path)
rag.delete_document(old_id)
logger.info("Deleted existing document %s (filename=%s)", old_id, filename)
@router.post("/ingest", response_model=IngestResponse) @router.post("/ingest", response_model=IngestResponse)
async def ingest_document(file: UploadFile = File(...)): async def ingest_document(file: UploadFile = File(...)):
"""Ingest a document into the RAG system.""" """Ingest a document into the RAG system."""
@ -41,34 +63,99 @@ async def ingest_document(file: UploadFile = File(...)):
logger.info("Ingesting file: %s (%d bytes)", filename, len(content)) logger.info("Ingesting file: %s (%d bytes)", filename, len(content))
rag = RAGService(settings=settings)
chunk_dir = settings.document_chunk_path
_delete_existing_document(rag, filename, chunk_dir)
document_id = str(uuid.uuid4())
chunker = TokenChunkingStrategy(
chunk_size=settings.chunk_size, overlap=settings.chunk_overlap
)
if file_ext == ".pdf": if file_ext == ".pdf":
from app.utils.pdf_parser import parse_pdf from app.utils.pdf_parser import parse_pdf_by_page
text = parse_pdf(temp_path)
pages = parse_pdf_by_page(temp_path)
if not pages:
raise HTTPException(
status_code=400,
detail="Document appears to be empty or could not be parsed",
)
chunked = chunker.chunk_pages(pages, overlap_tokens=settings.chunk_overlap)
chunk_texts = [text for text, _ in chunked]
page_numbers = [pn for _, pn in chunked]
os.makedirs(chunk_dir, exist_ok=True)
stem = Path(filename).stem
chunk_file_paths: list[str | None] = []
for page_num in page_numbers:
from app.utils.pdf_extractor import extract_page_as_pdf
chunk_filename = f"{stem}_page_{page_num}.pdf"
output_path = os.path.join(chunk_dir, chunk_filename)
try:
extract_page_as_pdf(temp_path, page_num, output_path)
chunk_file_paths.append(chunk_filename)
except Exception as exc:
logger.warning(
"Failed to extract page %d PDF for %s: %s",
page_num, filename, exc,
)
chunk_file_paths.append(None)
metadata = extract_metadata(
temp_path,
chunk_texts,
original_filename=filename,
page_numbers=page_numbers,
chunk_file_paths=chunk_file_paths,
document_id=document_id,
)
rag.ingest_document(temp_path, chunk_texts, metadata, document_id=document_id)
elif file_ext == ".docx": elif file_ext == ".docx":
from app.utils.docx_parser import parse_docx from app.utils.docx_parser import parse_docx
text = parse_docx(temp_path) text = parse_docx(temp_path)
chunks = chunker.chunk(text)
if not chunks:
raise HTTPException(
status_code=400,
detail="Document appears to be empty or could not be parsed",
)
metadata = extract_metadata(
temp_path, chunks, original_filename=filename, document_id=document_id
)
rag.ingest_document(temp_path, chunks, metadata, document_id=document_id)
elif file_ext == ".txt": elif file_ext == ".txt":
with open(temp_path, "r", encoding="utf-8") as f: with open(temp_path, "r", encoding="utf-8") as f:
text = f.read() text = f.read()
else:
text = ""
chunker = TokenChunkingStrategy(chunk_size=settings.chunk_size, overlap=settings.chunk_overlap) chunks = chunker.chunk(text)
chunks = chunker.chunk(text)
if not chunks: if not chunks:
raise HTTPException(status_code=400, detail="Document appears to be empty or could not be parsed") raise HTTPException(
status_code=400,
detail="Document appears to be empty or could not be parsed",
)
metadata = extract_metadata(temp_path, chunks, original_filename=filename) metadata = extract_metadata(
temp_path, chunks, original_filename=filename, document_id=document_id
)
rag.ingest_document(temp_path, chunks, metadata, document_id=document_id)
rag = RAGService(settings=settings) logger.info("Ingested %s: doc_id=%s", filename, document_id)
document_id = rag.ingest_document(temp_path, chunks, metadata)
logger.info("Ingested %s: %d chunks, doc_id=%s", filename, len(chunks), document_id)
chunk_count = len(chunk_texts) if file_ext == ".pdf" else len(chunks)
return IngestResponse( return IngestResponse(
document_id=document_id, document_id=document_id,
chunk_count=len(chunks), chunk_count=chunk_count,
filename=filename, filename=filename,
) )

View File

@ -42,11 +42,12 @@ class RAGService:
file_path: str, file_path: str,
chunks: List[str], chunks: List[str],
metadata_list: List[Dict[str, Any]], metadata_list: List[Dict[str, Any]],
document_id: Optional[str] = None,
) -> str: ) -> str:
if not chunks: if not chunks:
return "" return ""
document_id = str(uuid.uuid4()) document_id = document_id or str(uuid.uuid4())
ids = [f"{document_id}_{i}" for i in range(len(chunks))] ids = [f"{document_id}_{i}" for i in range(len(chunks))]
self.collection.add( self.collection.add(

View File

@ -27,14 +27,15 @@ class TestIngest:
with patch("app.services.rag.RAGService") as mock_rag_class: with patch("app.services.rag.RAGService") as mock_rag_class:
mock_rag = MagicMock() mock_rag = MagicMock()
mock_rag.ingest_document.return_value = "doc-123" mock_rag.ingest_document.return_value = "doc-123"
mock_rag.list_documents.return_value = ([], 0, 0)
mock_rag_class.return_value = mock_rag mock_rag_class.return_value = mock_rag
with patch("app.utils.pdf_parser.parse_pdf") as mock_parse: with patch("app.utils.pdf_parser.parse_pdf_by_page") as mock_parse:
mock_parse.return_value = "Parsed PDF text content" mock_parse.return_value = [(1, "Page 1 text"), (2, "Page 2 text")]
with patch("app.utils.chunking.TokenChunkingStrategy") as mock_chunk_class: with patch("app.utils.chunking.TokenChunkingStrategy") as mock_chunk_class:
mock_chunker = MagicMock() mock_chunker = MagicMock()
mock_chunker.chunk.return_value = ["chunk 1", "chunk 2"] mock_chunker.chunk_pages.return_value = [("chunk 1", 1), ("chunk 2", 2)]
mock_chunk_class.return_value = mock_chunker mock_chunk_class.return_value = mock_chunker
with patch("app.utils.metadata.extract_metadata") as mock_meta: with patch("app.utils.metadata.extract_metadata") as mock_meta:
@ -43,10 +44,11 @@ class TestIngest:
{"filename": "test.pdf", "chunk_index": 1}, {"filename": "test.pdf", "chunk_index": 1},
] ]
response = client.post( with patch("app.utils.pdf_extractor.extract_page_as_pdf"):
"/api/v1/ingest", response = client.post(
files={"file": ("test.pdf", io.BytesIO(b"%PDF-1.4"), "application/pdf")}, "/api/v1/ingest",
) files={"file": ("test.pdf", io.BytesIO(b"%PDF-1.4"), "application/pdf")},
)
assert response.status_code == 200 assert response.status_code == 200
data = response.json() data = response.json()
@ -61,6 +63,7 @@ class TestIngest:
with patch("app.services.rag.RAGService") as mock_rag_class: with patch("app.services.rag.RAGService") as mock_rag_class:
mock_rag = MagicMock() mock_rag = MagicMock()
mock_rag.ingest_document.return_value = "doc-456" mock_rag.ingest_document.return_value = "doc-456"
mock_rag.list_documents.return_value = ([], 0, 0)
mock_rag_class.return_value = mock_rag mock_rag_class.return_value = mock_rag
with patch("app.utils.docx_parser.parse_docx") as mock_parse: with patch("app.utils.docx_parser.parse_docx") as mock_parse:

View File

@ -0,0 +1,435 @@
"""Phase 1.5.5c tests: Page-aware ingest router.
Covers:
1. PDF upload triggers page-aware pipeline (parse_pdf_by_page, chunk_pages, extract_page_as_pdf)
2. DOCX upload uses old pipeline with document_id
3. TXT upload uses old pipeline with document_id
4. Same-filename replacement: existing document found → old chunks + PDFs deleted
5. Same-filename replacement: no existing document → no deletion
6. Empty PDF (no pages with text) → 400 error
7. Page PDFs saved to correct directory with correct naming
8. Metadata includes page_number and chunk_file_path for PDF uploads
9. Metadata does NOT include page_number for DOCX uploads (None)
"""
import io
import os
import uuid
from pathlib import Path
from unittest.mock import MagicMock, patch, call
import pytest
from fastapi.testclient import TestClient
class TestPageAwareIngest:
    """Page-aware document ingestion tests.

    Every test posts one upload to the ingest endpoint while the RAG service,
    parsers, chunker and metadata extractor are replaced with mocks, then
    inspects the response and the calls recorded on those mocks.
    """

    INGEST_URL = "/api/v1/ingest"
    PDF_MIME = "application/pdf"
    DOCX_MIME = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"

    @pytest.fixture
    def client(self):
        """Create test client with mocked dependencies."""
        from app.main import app

        return TestClient(app)

    @pytest.fixture
    def mock_settings(self):
        """Mock settings with document_chunk_path."""
        settings = MagicMock()
        settings.chunk_size = 1000
        settings.chunk_overlap = 200
        settings.document_chunk_path = "/tmp/test_document_chunk"
        return settings

    # ------------------------------------------------------------------ #
    # Shared helpers
    # ------------------------------------------------------------------ #
    @staticmethod
    def _install_rag(mock_rag_class, doc_id=None, listing=None):
        """Wire a RAGService instance mock onto the patched class and return it.

        ``listing`` is the 3-tuple returned by ``list_documents()``; it
        defaults to an empty store.
        """
        rag = MagicMock()
        if doc_id is not None:
            rag.ingest_document.return_value = doc_id
        rag.list_documents.return_value = ([], 0, 0) if listing is None else listing
        mock_rag_class.return_value = rag
        return rag

    @staticmethod
    def _install_chunker(mock_chunk_class, *, chunks=None, page_chunks=None):
        """Wire a chunker instance mock onto the patched class and return it."""
        chunker = MagicMock()
        if chunks is not None:
            chunker.chunk.return_value = chunks
        if page_chunks is not None:
            chunker.chunk_pages.return_value = page_chunks
        mock_chunk_class.return_value = chunker
        return chunker

    def _upload(self, client, filename, payload, content_type):
        """POST a single in-memory file to the ingest endpoint."""
        return client.post(
            self.INGEST_URL,
            files={"file": (filename, io.BytesIO(payload), content_type)},
        )

    # ------------------------------------------------------------------ #
    # Test 1: PDF upload triggers page-aware pipeline
    # ------------------------------------------------------------------ #
    def test_pdf_upload_uses_page_aware_pipeline(self, client, mock_settings):
        """PDF should go through parse_pdf_by_page → chunk_pages → extract_page_as_pdf."""
        doc_id = str(uuid.uuid4())

        # RAGService is replaced as a whole, so there is no need to patch
        # RAGService.list_documents separately: the router only ever talks to
        # the instance mock wired up by _install_rag.
        with patch("app.services.rag.RAGService") as mock_rag_class, \
                patch("app.core.config.get_settings", return_value=mock_settings), \
                patch("app.utils.pdf_parser.parse_pdf_by_page") as mock_parse_by_page, \
                patch("app.utils.chunking.TokenChunkingStrategy") as mock_chunk_class, \
                patch("app.utils.metadata.extract_metadata") as mock_meta, \
                patch("app.utils.pdf_extractor.extract_page_as_pdf") as mock_extract_page:
            self._install_rag(mock_rag_class, doc_id)

            # parse_pdf_by_page returns 2 pages
            mock_parse_by_page.return_value = [
                (1, "Page 1 text content"),
                (2, "Page 2 text content"),
            ]

            # chunk_pages returns one chunk per page
            mock_chunker = self._install_chunker(
                mock_chunk_class,
                page_chunks=[
                    ("Page 1 text content", 1),
                    ("Page 2 text content", 2),
                ],
            )

            mock_meta.return_value = [
                {"filename": "test.pdf", "chunk_index": 0, "page_number": 1},
                {"filename": "test.pdf", "chunk_index": 1, "page_number": 2},
            ]

            response = self._upload(client, "test.pdf", b"%PDF-1.4", self.PDF_MIME)

        assert response.status_code == 200
        data = response.json()
        assert data["chunk_count"] == 2
        assert data["filename"] == "test.pdf"

        # Verify page-aware parsing was called
        mock_parse_by_page.assert_called_once()

        # Verify chunk_pages was used (not chunk)
        mock_chunker.chunk_pages.assert_called_once()
        mock_chunker.chunk.assert_not_called()

        # Verify extract_page_as_pdf was called for each page
        assert mock_extract_page.call_count == 2

    # ------------------------------------------------------------------ #
    # Test 2: DOCX upload uses old pipeline
    # ------------------------------------------------------------------ #
    def test_docx_upload_uses_old_pipeline(self, client, mock_settings):
        """DOCX should use parse_docx → chunk → metadata with document_id only."""
        doc_id = str(uuid.uuid4())

        with patch("app.services.rag.RAGService") as mock_rag_class, \
                patch("app.core.config.get_settings", return_value=mock_settings), \
                patch("app.utils.docx_parser.parse_docx") as mock_parse, \
                patch("app.utils.chunking.TokenChunkingStrategy") as mock_chunk_class, \
                patch("app.utils.metadata.extract_metadata") as mock_meta:
            mock_rag = self._install_rag(mock_rag_class, doc_id)
            mock_parse.return_value = "DOCX text content"
            mock_chunker = self._install_chunker(mock_chunk_class, chunks=["chunk 1"])
            mock_meta.return_value = [{"filename": "test.docx", "chunk_index": 0}]

            response = self._upload(client, "test.docx", b"docx", self.DOCX_MIME)

        assert response.status_code == 200
        data = response.json()
        assert data["chunk_count"] == 1
        assert data["filename"] == "test.docx"

        # Verify old pipeline: parse_docx → chunk (not chunk_pages)
        mock_parse.assert_called_once()
        mock_chunker.chunk.assert_called_once()
        mock_chunker.chunk_pages.assert_not_called()

        # Verify extract_metadata received a real document_id keyword. The
        # value itself is checked: matching the text "document_id" in the
        # call repr would also pass for document_id=None.
        meta_kwargs = mock_meta.call_args[1]
        assert meta_kwargs.get("document_id") is not None

        # The same id must be handed on to the RAG service.
        ingest_kwargs = mock_rag.ingest_document.call_args[1]
        assert ingest_kwargs.get("document_id") == meta_kwargs["document_id"]

    # ------------------------------------------------------------------ #
    # Test 3: TXT upload uses old pipeline
    # ------------------------------------------------------------------ #
    def test_txt_upload_uses_old_pipeline(self, client, mock_settings):
        """TXT should read file → chunk → metadata with document_id."""
        doc_id = str(uuid.uuid4())

        with patch("app.services.rag.RAGService") as mock_rag_class, \
                patch("app.core.config.get_settings", return_value=mock_settings), \
                patch("app.utils.chunking.TokenChunkingStrategy") as mock_chunk_class, \
                patch("app.utils.metadata.extract_metadata") as mock_meta:
            self._install_rag(mock_rag_class, doc_id)
            mock_chunker = self._install_chunker(mock_chunk_class, chunks=["txt chunk"])
            mock_meta.return_value = [{"filename": "notes.txt", "chunk_index": 0}]

            response = self._upload(
                client, "notes.txt", b"Text content here", "text/plain"
            )

        assert response.status_code == 200
        data = response.json()
        assert data["chunk_count"] == 1
        assert data["filename"] == "notes.txt"
        mock_chunker.chunk.assert_called_once()
        mock_chunker.chunk_pages.assert_not_called()

    # ------------------------------------------------------------------ #
    # Test 4: Same-filename replacement: existing document → deletion
    # ------------------------------------------------------------------ #
    def test_same_filename_replacement_deletes_old(self, client, mock_settings, tmp_path):
        """Uploading file with same filename should delete old chunks and chunk PDFs."""
        doc_id = str(uuid.uuid4())
        old_doc_id = "old-doc-uuid-1234"

        chunk_dir = tmp_path / "document_chunk"
        chunk_dir.mkdir()
        old_pdf = chunk_dir / "test_page_3.pdf"
        old_pdf.write_text("old chunk pdf")
        mock_settings.document_chunk_path = str(chunk_dir)

        with patch("app.services.rag.RAGService") as mock_rag_class, \
                patch("app.core.config.get_settings", return_value=mock_settings), \
                patch("app.utils.pdf_parser.parse_pdf_by_page") as mock_parse_by_page, \
                patch("app.utils.chunking.TokenChunkingStrategy") as mock_chunk_class, \
                patch("app.utils.metadata.extract_metadata") as mock_meta, \
                patch("app.utils.pdf_extractor.extract_page_as_pdf"):
            # list_documents returns existing document with same filename
            mock_rag = self._install_rag(
                mock_rag_class,
                doc_id,
                listing=(
                    [{"document_id": old_doc_id, "filename": "test.pdf", "chunk_count": 3}],
                    1,
                    3,
                ),
            )

            # list_chunks returns chunks with file paths
            mock_rag.list_chunks.return_value = [
                {"chunk_id": f"{old_doc_id}_0", "chunk_file_path": "test_page_3.pdf"},
                {"chunk_id": f"{old_doc_id}_1", "chunk_file_path": "test_page_4.pdf"},
            ]

            mock_parse_by_page.return_value = [(1, "New page text")]
            self._install_chunker(mock_chunk_class, page_chunks=[("New page text", 1)])
            mock_meta.return_value = [{"filename": "test.pdf", "chunk_index": 0}]

            response = self._upload(client, "test.pdf", b"%PDF-1.4", self.PDF_MIME)

        assert response.status_code == 200

        # Verify the old document's chunks were looked up for PDF cleanup
        mock_rag.list_chunks.assert_called_once_with(old_doc_id)

        # Verify delete_document was called for old doc
        mock_rag.delete_document.assert_called_once_with(old_doc_id)

        # NOTE(review): old_pdf is not asserted to be gone. That check only
        # holds if the router reads document_chunk_path from the patched
        # get_settings — confirm, then add `assert not old_pdf.exists()`.

    # ------------------------------------------------------------------ #
    # Test 5: Same-filename replacement: no existing document
    # ------------------------------------------------------------------ #
    def test_no_existing_document_no_deletion(self, client, mock_settings):
        """Uploading new filename should NOT trigger any deletion."""
        doc_id = str(uuid.uuid4())

        with patch("app.services.rag.RAGService") as mock_rag_class, \
                patch("app.core.config.get_settings", return_value=mock_settings), \
                patch("app.utils.pdf_parser.parse_pdf_by_page") as mock_parse_by_page, \
                patch("app.utils.chunking.TokenChunkingStrategy") as mock_chunk_class, \
                patch("app.utils.metadata.extract_metadata") as mock_meta, \
                patch("app.utils.pdf_extractor.extract_page_as_pdf"):
            mock_rag = self._install_rag(mock_rag_class, doc_id)
            mock_parse_by_page.return_value = [(1, "Page text")]
            self._install_chunker(mock_chunk_class, page_chunks=[("Page text", 1)])
            mock_meta.return_value = [{"filename": "newdoc.pdf", "chunk_index": 0}]

            response = self._upload(client, "newdoc.pdf", b"%PDF-1.4", self.PDF_MIME)

        assert response.status_code == 200

        # Verify NO deletion happened
        mock_rag.delete_document.assert_not_called()

    # ------------------------------------------------------------------ #
    # Test 6: Empty PDF → 400 error
    # ------------------------------------------------------------------ #
    def test_empty_pdf_returns_400(self, client, mock_settings):
        """PDF with no extractable text should return 400."""
        with patch("app.core.config.get_settings", return_value=mock_settings), \
                patch("app.utils.pdf_parser.parse_pdf_by_page") as mock_parse_by_page, \
                patch("app.services.rag.RAGService") as mock_rag_class:
            self._install_rag(mock_rag_class)

            # Empty PDF: no pages
            mock_parse_by_page.return_value = []

            response = self._upload(client, "empty.pdf", b"%PDF-1.4", self.PDF_MIME)

        assert response.status_code == 400
        assert "empty" in response.json()["detail"].lower()

    # ------------------------------------------------------------------ #
    # Test 7: Page PDFs saved with correct naming
    # ------------------------------------------------------------------ #
    def test_page_pdf_naming_convention(self, client, mock_settings, tmp_path):
        """Chunk PDFs should be named {stem}_page_{N}.pdf with relative paths in metadata."""
        doc_id = str(uuid.uuid4())

        chunk_dir = tmp_path / "document_chunk"
        chunk_dir.mkdir()
        mock_settings.document_chunk_path = str(chunk_dir)

        with patch("app.services.rag.RAGService") as mock_rag_class, \
                patch("app.core.config.get_settings", return_value=mock_settings), \
                patch("app.utils.pdf_parser.parse_pdf_by_page") as mock_parse_by_page, \
                patch("app.utils.chunking.TokenChunkingStrategy") as mock_chunk_class, \
                patch("app.utils.metadata.extract_metadata") as mock_meta, \
                patch("app.utils.pdf_extractor.extract_page_as_pdf") as mock_extract_page:
            self._install_rag(mock_rag_class, doc_id)

            mock_parse_by_page.return_value = [
                (1, "Page 1"),
                (3, "Page 3"),  # page 2 was empty, skipped
            ]
            self._install_chunker(
                mock_chunk_class,
                page_chunks=[
                    ("Page 1", 1),
                    ("Page 3", 3),
                ],
            )
            mock_meta.return_value = [
                {"filename": "NEC4 ACC.pdf", "chunk_index": 0},
                {"filename": "NEC4 ACC.pdf", "chunk_index": 1},
            ]

            response = self._upload(client, "NEC4 ACC.pdf", b"%PDF-1.4", self.PDF_MIME)

        assert response.status_code == 200

        # Verify extract_page_as_pdf called with correct naming
        calls = mock_extract_page.call_args_list
        assert len(calls) == 2

        # First call: page 1 → "NEC4 ACC_page_1.pdf"
        output_path_1 = calls[0][0][2]  # third positional arg = output_path
        assert output_path_1.endswith("NEC4 ACC_page_1.pdf")

        # Second call: page 3 → "NEC4 ACC_page_3.pdf"
        output_path_3 = calls[1][0][2]
        assert output_path_3.endswith("NEC4 ACC_page_3.pdf")

        # The chunk directory must still exist after the upload. It is
        # created by this test's own setup, so this does not prove the router
        # created it.
        assert os.path.isdir(str(chunk_dir))

    # ------------------------------------------------------------------ #
    # Test 8: Metadata includes page_number and chunk_file_path for PDFs
    # ------------------------------------------------------------------ #
    def test_pdf_metadata_includes_page_info(self, client, mock_settings, tmp_path):
        """PDF metadata should include page_number and chunk_file_path."""
        doc_id = str(uuid.uuid4())

        chunk_dir = tmp_path / "document_chunk"
        chunk_dir.mkdir()
        mock_settings.document_chunk_path = str(chunk_dir)

        with patch("app.services.rag.RAGService") as mock_rag_class, \
                patch("app.core.config.get_settings", return_value=mock_settings), \
                patch("app.utils.pdf_parser.parse_pdf_by_page") as mock_parse_by_page, \
                patch("app.utils.chunking.TokenChunkingStrategy") as mock_chunk_class, \
                patch("app.utils.metadata.extract_metadata") as mock_meta, \
                patch("app.utils.pdf_extractor.extract_page_as_pdf"):
            self._install_rag(mock_rag_class, doc_id)
            mock_parse_by_page.return_value = [(2, "Page 2 content")]
            self._install_chunker(mock_chunk_class, page_chunks=[("Page 2 content", 2)])
            mock_meta.return_value = [
                {
                    "filename": "doc.pdf",
                    "chunk_index": 0,
                    "page_number": 2,
                    "chunk_file_path": "doc_page_2.pdf",
                },
            ]

            response = self._upload(client, "doc.pdf", b"%PDF-1.4", self.PDF_MIME)

        assert response.status_code == 200

        # Verify extract_metadata was called with page_numbers and chunk_file_paths
        meta_call_kwargs = mock_meta.call_args[1]
        assert "page_numbers" in meta_call_kwargs
        assert meta_call_kwargs["page_numbers"] == [2]
        assert "chunk_file_paths" in meta_call_kwargs
        assert meta_call_kwargs["chunk_file_paths"] == ["doc_page_2.pdf"]

    # ------------------------------------------------------------------ #
    # Test 9: Metadata does NOT include page_number for DOCX (None)
    # ------------------------------------------------------------------ #
    def test_docx_metadata_no_page_info(self, client, mock_settings):
        """DOCX metadata should have page_number=None (no page_numbers passed)."""
        doc_id = str(uuid.uuid4())

        with patch("app.services.rag.RAGService") as mock_rag_class, \
                patch("app.core.config.get_settings", return_value=mock_settings), \
                patch("app.utils.docx_parser.parse_docx") as mock_parse, \
                patch("app.utils.chunking.TokenChunkingStrategy") as mock_chunk_class, \
                patch("app.utils.metadata.extract_metadata") as mock_meta:
            self._install_rag(mock_rag_class, doc_id)
            mock_parse.return_value = "DOCX content"
            self._install_chunker(mock_chunk_class, chunks=["chunk 1"])
            mock_meta.return_value = [{"filename": "test.docx", "chunk_index": 0}]

            response = self._upload(client, "test.docx", b"docx", self.DOCX_MIME)

        assert response.status_code == 200

        # Verify extract_metadata was called WITHOUT page_numbers
        meta_call_kwargs = mock_meta.call_args[1]
        assert meta_call_kwargs.get("page_numbers") is None
        assert meta_call_kwargs.get("chunk_file_paths") is None