feat(backend): refactor ingest pipeline for page-aware chunking with PDF generation

PDF uploads now use parse_pdf_by_page() -> chunk_pages() -> extract page PDFs -> enhanced metadata with page_number, chunk_file_path, and document_id. Same-filename replacement deletes old chunks and PDFs before re-ingest. DOCX/TXT keep original flat flow with document_id added. RAGService.ingest_document() accepts optional document_id parameter.

Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent)

Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
This commit is contained in:
Woody 2026-04-24 10:53:17 +08:00
parent 8c84062996
commit b2dd385443
4 changed files with 548 additions and 22 deletions

View File

@ -2,6 +2,7 @@
import logging
import os
import tempfile
import uuid
from pathlib import Path
from fastapi import APIRouter, UploadFile, File, HTTPException
@ -14,6 +15,27 @@ router = APIRouter(tags=["ingest"])
SUPPORTED_EXTENSIONS = {".pdf", ".docx", ".txt"}
def _delete_existing_document(rag, filename: str, chunk_dir: str) -> None:
    """Purge every stored document whose filename equals ``filename``.

    For each matching document, the chunk PDFs named by its chunks'
    ``chunk_file_path`` entries are removed from ``chunk_dir`` and the
    document is then deleted through ``rag.delete_document``.

    File removal is best-effort: a file that is already gone is ignored, and
    any other ``OSError`` is logged as a warning so that one undeletable file
    cannot prevent the document's stored entries from being deleted.

    Args:
        rag: Service object providing ``list_documents()``,
            ``list_chunks(document_id)`` and ``delete_document(document_id)``.
        filename: Filename to match against each listed document's
            ``"filename"`` entry.
        chunk_dir: Directory that the ``chunk_file_path`` values are joined to.
    """
    doc_list, _, _ = rag.list_documents()
    existing = [d for d in doc_list if d["filename"] == filename]
    if not existing:
        return

    for doc in existing:
        old_id = doc["document_id"]

        # Several chunks may point at the same file; dict.fromkeys
        # de-duplicates while keeping the listing order.
        chunk_files = dict.fromkeys(
            chunk.get("chunk_file_path") for chunk in rag.list_chunks(old_id)
        )
        for chunk_file in chunk_files:
            if not chunk_file:
                continue
            full_path = os.path.join(chunk_dir, chunk_file)
            # EAFP instead of exists()+unlink(): no window in which the file
            # can disappear between the check and the removal.
            try:
                os.unlink(full_path)
            except FileNotFoundError:
                pass
            except OSError as exc:
                logger.warning(
                    "Failed to remove chunk PDF %s for document %s: %s",
                    full_path, old_id, exc,
                )

        rag.delete_document(old_id)
        logger.info("Deleted existing document %s (filename=%s)", old_id, filename)
@router.post("/ingest", response_model=IngestResponse)
async def ingest_document(file: UploadFile = File(...)):
"""Ingest a document into the RAG system."""
@ -41,34 +63,99 @@ async def ingest_document(file: UploadFile = File(...)):
logger.info("Ingesting file: %s (%d bytes)", filename, len(content))
rag = RAGService(settings=settings)
chunk_dir = settings.document_chunk_path
_delete_existing_document(rag, filename, chunk_dir)
document_id = str(uuid.uuid4())
chunker = TokenChunkingStrategy(
chunk_size=settings.chunk_size, overlap=settings.chunk_overlap
)
if file_ext == ".pdf":
from app.utils.pdf_parser import parse_pdf
text = parse_pdf(temp_path)
from app.utils.pdf_parser import parse_pdf_by_page
pages = parse_pdf_by_page(temp_path)
if not pages:
raise HTTPException(
status_code=400,
detail="Document appears to be empty or could not be parsed",
)
chunked = chunker.chunk_pages(pages, overlap_tokens=settings.chunk_overlap)
chunk_texts = [text for text, _ in chunked]
page_numbers = [pn for _, pn in chunked]
os.makedirs(chunk_dir, exist_ok=True)
stem = Path(filename).stem
chunk_file_paths: list[str | None] = []
for page_num in page_numbers:
from app.utils.pdf_extractor import extract_page_as_pdf
chunk_filename = f"{stem}_page_{page_num}.pdf"
output_path = os.path.join(chunk_dir, chunk_filename)
try:
extract_page_as_pdf(temp_path, page_num, output_path)
chunk_file_paths.append(chunk_filename)
except Exception as exc:
logger.warning(
"Failed to extract page %d PDF for %s: %s",
page_num, filename, exc,
)
chunk_file_paths.append(None)
metadata = extract_metadata(
temp_path,
chunk_texts,
original_filename=filename,
page_numbers=page_numbers,
chunk_file_paths=chunk_file_paths,
document_id=document_id,
)
rag.ingest_document(temp_path, chunk_texts, metadata, document_id=document_id)
elif file_ext == ".docx":
from app.utils.docx_parser import parse_docx
text = parse_docx(temp_path)
elif file_ext == ".txt":
with open(temp_path, "r", encoding="utf-8") as f:
text = f.read()
else:
text = ""
chunker = TokenChunkingStrategy(chunk_size=settings.chunk_size, overlap=settings.chunk_overlap)
text = parse_docx(temp_path)
chunks = chunker.chunk(text)
if not chunks:
raise HTTPException(status_code=400, detail="Document appears to be empty or could not be parsed")
raise HTTPException(
status_code=400,
detail="Document appears to be empty or could not be parsed",
)
metadata = extract_metadata(temp_path, chunks, original_filename=filename)
metadata = extract_metadata(
temp_path, chunks, original_filename=filename, document_id=document_id
)
rag.ingest_document(temp_path, chunks, metadata, document_id=document_id)
rag = RAGService(settings=settings)
document_id = rag.ingest_document(temp_path, chunks, metadata)
elif file_ext == ".txt":
with open(temp_path, "r", encoding="utf-8") as f:
text = f.read()
logger.info("Ingested %s: %d chunks, doc_id=%s", filename, len(chunks), document_id)
chunks = chunker.chunk(text)
if not chunks:
raise HTTPException(
status_code=400,
detail="Document appears to be empty or could not be parsed",
)
metadata = extract_metadata(
temp_path, chunks, original_filename=filename, document_id=document_id
)
rag.ingest_document(temp_path, chunks, metadata, document_id=document_id)
logger.info("Ingested %s: doc_id=%s", filename, document_id)
chunk_count = len(chunk_texts) if file_ext == ".pdf" else len(chunks)
return IngestResponse(
document_id=document_id,
chunk_count=len(chunks),
chunk_count=chunk_count,
filename=filename,
)

View File

@ -42,11 +42,12 @@ class RAGService:
file_path: str,
chunks: List[str],
metadata_list: List[Dict[str, Any]],
document_id: Optional[str] = None,
) -> str:
if not chunks:
return ""
document_id = str(uuid.uuid4())
document_id = document_id or str(uuid.uuid4())
ids = [f"{document_id}_{i}" for i in range(len(chunks))]
self.collection.add(

View File

@ -27,14 +27,15 @@ class TestIngest:
with patch("app.services.rag.RAGService") as mock_rag_class:
mock_rag = MagicMock()
mock_rag.ingest_document.return_value = "doc-123"
mock_rag.list_documents.return_value = ([], 0, 0)
mock_rag_class.return_value = mock_rag
with patch("app.utils.pdf_parser.parse_pdf") as mock_parse:
mock_parse.return_value = "Parsed PDF text content"
with patch("app.utils.pdf_parser.parse_pdf_by_page") as mock_parse:
mock_parse.return_value = [(1, "Page 1 text"), (2, "Page 2 text")]
with patch("app.utils.chunking.TokenChunkingStrategy") as mock_chunk_class:
mock_chunker = MagicMock()
mock_chunker.chunk.return_value = ["chunk 1", "chunk 2"]
mock_chunker.chunk_pages.return_value = [("chunk 1", 1), ("chunk 2", 2)]
mock_chunk_class.return_value = mock_chunker
with patch("app.utils.metadata.extract_metadata") as mock_meta:
@ -43,6 +44,7 @@ class TestIngest:
{"filename": "test.pdf", "chunk_index": 1},
]
with patch("app.utils.pdf_extractor.extract_page_as_pdf"):
response = client.post(
"/api/v1/ingest",
files={"file": ("test.pdf", io.BytesIO(b"%PDF-1.4"), "application/pdf")},
@ -61,6 +63,7 @@ class TestIngest:
with patch("app.services.rag.RAGService") as mock_rag_class:
mock_rag = MagicMock()
mock_rag.ingest_document.return_value = "doc-456"
mock_rag.list_documents.return_value = ([], 0, 0)
mock_rag_class.return_value = mock_rag
with patch("app.utils.docx_parser.parse_docx") as mock_parse:

View File

@ -0,0 +1,435 @@
"""Phase 1.5.5c tests: Page-aware ingest router.
Covers:
1. PDF upload triggers page-aware pipeline (parse_pdf_by_page, chunk_pages, extract_page_as_pdf)
2. DOCX upload uses old pipeline with document_id
3. TXT upload uses old pipeline with document_id
4. Same-filename replacement: existing document found → old chunks + PDFs deleted
5. Same-filename replacement: no existing document → no deletion
6. Empty PDF (no pages with text) → 400 error
7. Page PDFs saved to correct directory with correct naming
8. Metadata includes page_number and chunk_file_path for PDF uploads
9. Metadata does NOT include page_number for DOCX uploads (None)
"""
import io
import os
import uuid
from pathlib import Path
from unittest.mock import MagicMock, patch, call
import pytest
from fastapi.testclient import TestClient
class TestPageAwareIngest:
    """Page-aware document ingestion tests.

    Every test posts a file to ``/api/v1/ingest`` with the RAG service,
    parsers, chunker, metadata extractor and page extractor replaced by
    mocks, then inspects the HTTP response and the calls the mocks recorded.
    """

    DOCX_MIME = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"

    @pytest.fixture
    def client(self):
        """Create test client with mocked dependencies."""
        from app.main import app

        return TestClient(app)

    @pytest.fixture
    def mock_settings(self, tmp_path):
        """Mock settings whose document_chunk_path lives under ``tmp_path``.

        A per-test directory keeps runs from sharing (or creating) a fixed
        location under /tmp. The directory itself is not created here.
        """
        settings = MagicMock()
        settings.chunk_size = 1000
        settings.chunk_overlap = 200
        settings.document_chunk_path = str(tmp_path / "document_chunk")
        return settings

    # ------------------------------------------------------------------ #
    # Test 1: PDF upload triggers page-aware pipeline
    # ------------------------------------------------------------------ #
    def test_pdf_upload_uses_page_aware_pipeline(self, client, mock_settings):
        """PDF should go through parse_pdf_by_page → chunk_pages → extract_page_as_pdf."""
        doc_id = str(uuid.uuid4())

        with patch("app.services.rag.RAGService") as mock_rag_class, \
                patch("app.core.config.get_settings", return_value=mock_settings), \
                patch("app.utils.pdf_parser.parse_pdf_by_page") as mock_parse_by_page, \
                patch("app.utils.chunking.TokenChunkingStrategy") as mock_chunk_class, \
                patch("app.utils.metadata.extract_metadata") as mock_meta, \
                patch("app.utils.pdf_extractor.extract_page_as_pdf") as mock_extract_page:
            # RAGService instance; list_documents is configured on the
            # instance because that is the object the router calls.
            mock_rag = MagicMock()
            mock_rag.ingest_document.return_value = doc_id
            mock_rag.list_documents.return_value = ([], 0, 0)
            mock_rag_class.return_value = mock_rag

            # parse_pdf_by_page returns 2 pages
            mock_parse_by_page.return_value = [
                (1, "Page 1 text content"),
                (2, "Page 2 text content"),
            ]

            # chunk_pages returns one chunk per page
            mock_chunker = MagicMock()
            mock_chunker.chunk_pages.return_value = [
                ("Page 1 text content", 1),
                ("Page 2 text content", 2),
            ]
            mock_chunk_class.return_value = mock_chunker

            # metadata
            mock_meta.return_value = [
                {"filename": "test.pdf", "chunk_index": 0, "page_number": 1},
                {"filename": "test.pdf", "chunk_index": 1, "page_number": 2},
            ]

            response = client.post(
                "/api/v1/ingest",
                files={"file": ("test.pdf", io.BytesIO(b"%PDF-1.4"), "application/pdf")},
            )

        assert response.status_code == 200
        data = response.json()
        assert data["chunk_count"] == 2
        assert data["filename"] == "test.pdf"

        # Verify page-aware parsing was called
        mock_parse_by_page.assert_called_once()
        # Verify chunk_pages was used (not chunk)
        mock_chunker.chunk_pages.assert_called_once()
        mock_chunker.chunk.assert_not_called()
        # Verify extract_page_as_pdf was called for each page
        assert mock_extract_page.call_count == 2

    # ------------------------------------------------------------------ #
    # Test 2: DOCX upload uses old pipeline
    # ------------------------------------------------------------------ #
    def test_docx_upload_uses_old_pipeline(self, client, mock_settings):
        """DOCX should use parse_docx → chunk → metadata with document_id only."""
        doc_id = str(uuid.uuid4())

        with patch("app.services.rag.RAGService") as mock_rag_class, \
                patch("app.core.config.get_settings", return_value=mock_settings), \
                patch("app.utils.docx_parser.parse_docx") as mock_parse, \
                patch("app.utils.chunking.TokenChunkingStrategy") as mock_chunk_class, \
                patch("app.utils.metadata.extract_metadata") as mock_meta:
            mock_rag = MagicMock()
            mock_rag.ingest_document.return_value = doc_id
            mock_rag.list_documents.return_value = ([], 0, 0)
            mock_rag_class.return_value = mock_rag

            mock_parse.return_value = "DOCX text content"

            mock_chunker = MagicMock()
            mock_chunker.chunk.return_value = ["chunk 1"]
            mock_chunk_class.return_value = mock_chunker

            mock_meta.return_value = [{"filename": "test.docx", "chunk_index": 0}]

            response = client.post(
                "/api/v1/ingest",
                files={"file": ("test.docx", io.BytesIO(b"docx"), self.DOCX_MIME)},
            )

        assert response.status_code == 200
        data = response.json()
        assert data["chunk_count"] == 1
        assert data["filename"] == "test.docx"

        # Verify old pipeline: parse_docx → chunk (not chunk_pages)
        mock_parse.assert_called_once()
        mock_chunker.chunk.assert_called_once()
        mock_chunker.chunk_pages.assert_not_called()

        # Verify extract_metadata received a real document_id keyword and
        # that the same id was handed on to RAGService.ingest_document.
        passed_id = mock_meta.call_args[1].get("document_id")
        assert isinstance(passed_id, str) and passed_id
        assert mock_rag.ingest_document.call_args[1].get("document_id") == passed_id

    # ------------------------------------------------------------------ #
    # Test 3: TXT upload uses old pipeline
    # ------------------------------------------------------------------ #
    def test_txt_upload_uses_old_pipeline(self, client, mock_settings):
        """TXT should read file → chunk → metadata with document_id."""
        doc_id = str(uuid.uuid4())

        with patch("app.services.rag.RAGService") as mock_rag_class, \
                patch("app.core.config.get_settings", return_value=mock_settings), \
                patch("app.utils.chunking.TokenChunkingStrategy") as mock_chunk_class, \
                patch("app.utils.metadata.extract_metadata") as mock_meta:
            mock_rag = MagicMock()
            mock_rag.ingest_document.return_value = doc_id
            mock_rag.list_documents.return_value = ([], 0, 0)
            mock_rag_class.return_value = mock_rag

            mock_chunker = MagicMock()
            mock_chunker.chunk.return_value = ["txt chunk"]
            mock_chunk_class.return_value = mock_chunker

            mock_meta.return_value = [{"filename": "notes.txt", "chunk_index": 0}]

            response = client.post(
                "/api/v1/ingest",
                files={"file": ("notes.txt", io.BytesIO(b"Text content here"), "text/plain")},
            )

        assert response.status_code == 200
        data = response.json()
        assert data["chunk_count"] == 1
        assert data["filename"] == "notes.txt"

        mock_chunker.chunk.assert_called_once()
        mock_chunker.chunk_pages.assert_not_called()

    # ------------------------------------------------------------------ #
    # Test 4: Same-filename replacement: existing document → deletion
    # ------------------------------------------------------------------ #
    def test_same_filename_replacement_deletes_old(self, client, mock_settings, tmp_path):
        """Uploading file with same filename should delete old chunks and chunk PDFs."""
        doc_id = str(uuid.uuid4())
        old_doc_id = "old-doc-uuid-1234"

        chunk_dir = tmp_path / "document_chunk"
        chunk_dir.mkdir()
        old_pdf = chunk_dir / "test_page_3.pdf"
        old_pdf.write_text("old chunk pdf")
        mock_settings.document_chunk_path = str(chunk_dir)

        with patch("app.services.rag.RAGService") as mock_rag_class, \
                patch("app.core.config.get_settings", return_value=mock_settings), \
                patch("app.utils.pdf_parser.parse_pdf_by_page") as mock_parse_by_page, \
                patch("app.utils.chunking.TokenChunkingStrategy") as mock_chunk_class, \
                patch("app.utils.metadata.extract_metadata") as mock_meta, \
                patch("app.utils.pdf_extractor.extract_page_as_pdf"):
            mock_rag = MagicMock()
            mock_rag.ingest_document.return_value = doc_id
            # list_documents returns existing document with same filename
            mock_rag.list_documents.return_value = (
                [{"document_id": old_doc_id, "filename": "test.pdf", "chunk_count": 3}],
                1, 3
            )
            mock_rag_class.return_value = mock_rag

            # list_chunks returns chunk with file path
            mock_rag.list_chunks.return_value = [
                {"chunk_id": f"{old_doc_id}_0", "chunk_file_path": "test_page_3.pdf"},
                {"chunk_id": f"{old_doc_id}_1", "chunk_file_path": "test_page_4.pdf"},
            ]

            mock_parse_by_page.return_value = [(1, "New page text")]

            mock_chunker = MagicMock()
            mock_chunker.chunk_pages.return_value = [("New page text", 1)]
            mock_chunk_class.return_value = mock_chunker

            mock_meta.return_value = [{"filename": "test.pdf", "chunk_index": 0}]

            response = client.post(
                "/api/v1/ingest",
                files={"file": ("test.pdf", io.BytesIO(b"%PDF-1.4"), "application/pdf")},
            )

        assert response.status_code == 200
        # Verify delete_document was called for old doc
        mock_rag.delete_document.assert_called_once_with(old_doc_id)

    # ------------------------------------------------------------------ #
    # Test 5: Same-filename replacement: no existing document
    # ------------------------------------------------------------------ #
    def test_no_existing_document_no_deletion(self, client, mock_settings):
        """Uploading new filename should NOT trigger any deletion."""
        doc_id = str(uuid.uuid4())

        with patch("app.services.rag.RAGService") as mock_rag_class, \
                patch("app.core.config.get_settings", return_value=mock_settings), \
                patch("app.utils.pdf_parser.parse_pdf_by_page") as mock_parse_by_page, \
                patch("app.utils.chunking.TokenChunkingStrategy") as mock_chunk_class, \
                patch("app.utils.metadata.extract_metadata") as mock_meta, \
                patch("app.utils.pdf_extractor.extract_page_as_pdf"):
            mock_rag = MagicMock()
            mock_rag.ingest_document.return_value = doc_id
            mock_rag.list_documents.return_value = ([], 0, 0)
            mock_rag_class.return_value = mock_rag

            mock_parse_by_page.return_value = [(1, "Page text")]

            mock_chunker = MagicMock()
            mock_chunker.chunk_pages.return_value = [("Page text", 1)]
            mock_chunk_class.return_value = mock_chunker

            mock_meta.return_value = [{"filename": "newdoc.pdf", "chunk_index": 0}]

            response = client.post(
                "/api/v1/ingest",
                files={"file": ("newdoc.pdf", io.BytesIO(b"%PDF-1.4"), "application/pdf")},
            )

        assert response.status_code == 200
        # Verify NO deletion happened
        mock_rag.delete_document.assert_not_called()

    # ------------------------------------------------------------------ #
    # Test 6: Empty PDF → 400 error
    # ------------------------------------------------------------------ #
    def test_empty_pdf_returns_400(self, client, mock_settings):
        """PDF with no extractable text should return 400."""
        with patch("app.core.config.get_settings", return_value=mock_settings), \
                patch("app.utils.pdf_parser.parse_pdf_by_page") as mock_parse_by_page, \
                patch("app.services.rag.RAGService") as mock_rag_class:
            mock_rag = MagicMock()
            mock_rag.list_documents.return_value = ([], 0, 0)
            mock_rag_class.return_value = mock_rag

            # Empty PDF: no pages
            mock_parse_by_page.return_value = []

            response = client.post(
                "/api/v1/ingest",
                files={"file": ("empty.pdf", io.BytesIO(b"%PDF-1.4"), "application/pdf")},
            )

        assert response.status_code == 400
        assert "empty" in response.json()["detail"].lower()

    # ------------------------------------------------------------------ #
    # Test 7: Page PDFs saved with correct naming
    # ------------------------------------------------------------------ #
    def test_page_pdf_naming_convention(self, client, mock_settings, tmp_path):
        """Chunk PDFs should be named {stem}_page_{N}.pdf with relative paths in metadata."""
        doc_id = str(uuid.uuid4())

        chunk_dir = tmp_path / "document_chunk"
        chunk_dir.mkdir()
        mock_settings.document_chunk_path = str(chunk_dir)

        with patch("app.services.rag.RAGService") as mock_rag_class, \
                patch("app.core.config.get_settings", return_value=mock_settings), \
                patch("app.utils.pdf_parser.parse_pdf_by_page") as mock_parse_by_page, \
                patch("app.utils.chunking.TokenChunkingStrategy") as mock_chunk_class, \
                patch("app.utils.metadata.extract_metadata") as mock_meta, \
                patch("app.utils.pdf_extractor.extract_page_as_pdf") as mock_extract_page:
            mock_rag = MagicMock()
            mock_rag.ingest_document.return_value = doc_id
            mock_rag.list_documents.return_value = ([], 0, 0)
            mock_rag_class.return_value = mock_rag

            mock_parse_by_page.return_value = [
                (1, "Page 1"),
                (3, "Page 3"),  # page 2 was empty, skipped
            ]

            mock_chunker = MagicMock()
            mock_chunker.chunk_pages.return_value = [
                ("Page 1", 1),
                ("Page 3", 3),
            ]
            mock_chunk_class.return_value = mock_chunker

            mock_meta.return_value = [
                {"filename": "NEC4 ACC.pdf", "chunk_index": 0},
                {"filename": "NEC4 ACC.pdf", "chunk_index": 1},
            ]

            response = client.post(
                "/api/v1/ingest",
                files={"file": ("NEC4 ACC.pdf", io.BytesIO(b"%PDF-1.4"), "application/pdf")},
            )

        assert response.status_code == 200

        # Verify extract_page_as_pdf called with correct naming
        calls = mock_extract_page.call_args_list
        assert len(calls) == 2

        # First call: page 1 → "NEC4 ACC_page_1.pdf"
        output_path_1 = calls[0][0][2]  # third positional arg = output_path
        assert output_path_1.endswith("NEC4 ACC_page_1.pdf")

        # Second call: page 3 → "NEC4 ACC_page_3.pdf"
        output_path_3 = calls[1][0][2]
        assert output_path_3.endswith("NEC4 ACC_page_3.pdf")

        # Verify the directory was created
        assert os.path.isdir(str(chunk_dir))

    # ------------------------------------------------------------------ #
    # Test 8: Metadata includes page_number and chunk_file_path for PDFs
    # ------------------------------------------------------------------ #
    def test_pdf_metadata_includes_page_info(self, client, mock_settings, tmp_path):
        """PDF metadata should include page_number and chunk_file_path."""
        doc_id = str(uuid.uuid4())

        chunk_dir = tmp_path / "document_chunk"
        chunk_dir.mkdir()
        mock_settings.document_chunk_path = str(chunk_dir)

        with patch("app.services.rag.RAGService") as mock_rag_class, \
                patch("app.core.config.get_settings", return_value=mock_settings), \
                patch("app.utils.pdf_parser.parse_pdf_by_page") as mock_parse_by_page, \
                patch("app.utils.chunking.TokenChunkingStrategy") as mock_chunk_class, \
                patch("app.utils.metadata.extract_metadata") as mock_meta, \
                patch("app.utils.pdf_extractor.extract_page_as_pdf"):
            mock_rag = MagicMock()
            mock_rag.ingest_document.return_value = doc_id
            mock_rag.list_documents.return_value = ([], 0, 0)
            mock_rag_class.return_value = mock_rag

            mock_parse_by_page.return_value = [(2, "Page 2 content")]

            mock_chunker = MagicMock()
            mock_chunker.chunk_pages.return_value = [("Page 2 content", 2)]
            mock_chunk_class.return_value = mock_chunker

            mock_meta.return_value = [
                {
                    "filename": "doc.pdf",
                    "chunk_index": 0,
                    "page_number": 2,
                    "chunk_file_path": "doc_page_2.pdf",
                },
            ]

            response = client.post(
                "/api/v1/ingest",
                files={"file": ("doc.pdf", io.BytesIO(b"%PDF-1.4"), "application/pdf")},
            )

        assert response.status_code == 200

        # Verify extract_metadata was called with page_numbers and chunk_file_paths
        meta_call_kwargs = mock_meta.call_args[1]
        assert "page_numbers" in meta_call_kwargs
        assert meta_call_kwargs["page_numbers"] == [2]
        assert "chunk_file_paths" in meta_call_kwargs
        assert meta_call_kwargs["chunk_file_paths"] == ["doc_page_2.pdf"]

    # ------------------------------------------------------------------ #
    # Test 9: Metadata does NOT include page_number for DOCX (None)
    # ------------------------------------------------------------------ #
    def test_docx_metadata_no_page_info(self, client, mock_settings):
        """DOCX metadata should have page_number=None (no page_numbers passed)."""
        doc_id = str(uuid.uuid4())

        with patch("app.services.rag.RAGService") as mock_rag_class, \
                patch("app.core.config.get_settings", return_value=mock_settings), \
                patch("app.utils.docx_parser.parse_docx") as mock_parse, \
                patch("app.utils.chunking.TokenChunkingStrategy") as mock_chunk_class, \
                patch("app.utils.metadata.extract_metadata") as mock_meta:
            mock_rag = MagicMock()
            mock_rag.ingest_document.return_value = doc_id
            mock_rag.list_documents.return_value = ([], 0, 0)
            mock_rag_class.return_value = mock_rag

            mock_parse.return_value = "DOCX content"

            mock_chunker = MagicMock()
            mock_chunker.chunk.return_value = ["chunk 1"]
            mock_chunk_class.return_value = mock_chunker

            mock_meta.return_value = [{"filename": "test.docx", "chunk_index": 0}]

            response = client.post(
                "/api/v1/ingest",
                files={"file": ("test.docx", io.BytesIO(b"docx"), self.DOCX_MIME)},
            )

        assert response.status_code == 200

        # Verify extract_metadata was called WITHOUT page_numbers
        meta_call_kwargs = mock_meta.call_args[1]
        assert meta_call_kwargs.get("page_numbers") is None
        assert meta_call_kwargs.get("chunk_file_paths") is None