feat(backend): refactor ingest pipeline for page-aware chunking with PDF generation
PDF uploads now use parse_pdf_by_page() -> chunk_pages() -> extract page PDFs -> enhanced metadata with page_number, chunk_file_path, and document_id. Same-filename replacement deletes old chunks and PDFs before re-ingest. DOCX/TXT keep original flat flow with document_id added. RAGService.ingest_document() accepts optional document_id parameter. Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent) Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
This commit is contained in:
parent
8c84062996
commit
b2dd385443
|
|
@ -2,6 +2,7 @@
|
|||
import logging
|
||||
import os
|
||||
import tempfile
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
|
||||
from fastapi import APIRouter, UploadFile, File, HTTPException
|
||||
|
|
@ -14,6 +15,27 @@ router = APIRouter(tags=["ingest"])
|
|||
SUPPORTED_EXTENSIONS = {".pdf", ".docx", ".txt"}
|
||||
|
||||
|
||||
def _delete_existing_document(rag, filename: str, chunk_dir: str) -> None:
    """Delete stored documents with the same filename, plus their chunk PDFs.

    Documents are looked up through ``rag.list_documents()``. For every document
    whose ``"filename"`` equals *filename*, the files named by the
    ``"chunk_file_path"`` entries of ``rag.list_chunks(document_id)`` are removed
    from *chunk_dir*, and the document is then removed with
    ``rag.delete_document(document_id)``.

    Args:
        rag: Service object exposing ``list_documents``, ``list_chunks`` and
            ``delete_document``.
        filename: Upload filename to match exactly.
        chunk_dir: Directory that ``chunk_file_path`` values are joined onto.
    """
    doc_list, _, _ = rag.list_documents()

    # Snapshot the matches first so deletions cannot affect what is iterated.
    existing = [d for d in doc_list if d["filename"] == filename]

    for doc in existing:
        old_id = doc["document_id"]

        # More than one chunk may reference the same file, so collapse
        # duplicates (order-preserving) and unlink each path only once.
        chunk_files = dict.fromkeys(
            chunk.get("chunk_file_path") for chunk in rag.list_chunks(old_id)
        )
        for chunk_file in chunk_files:
            if not chunk_file:
                continue
            try:
                os.unlink(os.path.join(chunk_dir, chunk_file))
            except FileNotFoundError:
                # Nothing to remove: the file was never written or is already
                # gone. Attempting the unlink directly avoids the window
                # between an existence check and the removal.
                pass

        rag.delete_document(old_id)
        logger.info("Deleted existing document %s (filename=%s)", old_id, filename)
|
||||
|
||||
|
||||
@router.post("/ingest", response_model=IngestResponse)
|
||||
async def ingest_document(file: UploadFile = File(...)):
|
||||
"""Ingest a document into the RAG system."""
|
||||
|
|
@ -41,34 +63,99 @@ async def ingest_document(file: UploadFile = File(...)):
|
|||
|
||||
logger.info("Ingesting file: %s (%d bytes)", filename, len(content))
|
||||
|
||||
rag = RAGService(settings=settings)
|
||||
chunk_dir = settings.document_chunk_path
|
||||
_delete_existing_document(rag, filename, chunk_dir)
|
||||
|
||||
document_id = str(uuid.uuid4())
|
||||
chunker = TokenChunkingStrategy(
|
||||
chunk_size=settings.chunk_size, overlap=settings.chunk_overlap
|
||||
)
|
||||
|
||||
if file_ext == ".pdf":
|
||||
from app.utils.pdf_parser import parse_pdf
|
||||
text = parse_pdf(temp_path)
|
||||
from app.utils.pdf_parser import parse_pdf_by_page
|
||||
|
||||
pages = parse_pdf_by_page(temp_path)
|
||||
|
||||
if not pages:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail="Document appears to be empty or could not be parsed",
|
||||
)
|
||||
|
||||
chunked = chunker.chunk_pages(pages, overlap_tokens=settings.chunk_overlap)
|
||||
chunk_texts = [text for text, _ in chunked]
|
||||
page_numbers = [pn for _, pn in chunked]
|
||||
|
||||
os.makedirs(chunk_dir, exist_ok=True)
|
||||
stem = Path(filename).stem
|
||||
chunk_file_paths: list[str | None] = []
|
||||
for page_num in page_numbers:
|
||||
from app.utils.pdf_extractor import extract_page_as_pdf
|
||||
|
||||
chunk_filename = f"{stem}_page_{page_num}.pdf"
|
||||
output_path = os.path.join(chunk_dir, chunk_filename)
|
||||
try:
|
||||
extract_page_as_pdf(temp_path, page_num, output_path)
|
||||
chunk_file_paths.append(chunk_filename)
|
||||
except Exception as exc:
|
||||
logger.warning(
|
||||
"Failed to extract page %d PDF for %s: %s",
|
||||
page_num, filename, exc,
|
||||
)
|
||||
chunk_file_paths.append(None)
|
||||
|
||||
metadata = extract_metadata(
|
||||
temp_path,
|
||||
chunk_texts,
|
||||
original_filename=filename,
|
||||
page_numbers=page_numbers,
|
||||
chunk_file_paths=chunk_file_paths,
|
||||
document_id=document_id,
|
||||
)
|
||||
|
||||
rag.ingest_document(temp_path, chunk_texts, metadata, document_id=document_id)
|
||||
|
||||
elif file_ext == ".docx":
|
||||
from app.utils.docx_parser import parse_docx
|
||||
|
||||
text = parse_docx(temp_path)
|
||||
chunks = chunker.chunk(text)
|
||||
|
||||
if not chunks:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail="Document appears to be empty or could not be parsed",
|
||||
)
|
||||
|
||||
metadata = extract_metadata(
|
||||
temp_path, chunks, original_filename=filename, document_id=document_id
|
||||
)
|
||||
rag.ingest_document(temp_path, chunks, metadata, document_id=document_id)
|
||||
|
||||
elif file_ext == ".txt":
|
||||
with open(temp_path, "r", encoding="utf-8") as f:
|
||||
text = f.read()
|
||||
else:
|
||||
text = ""
|
||||
|
||||
chunker = TokenChunkingStrategy(chunk_size=settings.chunk_size, overlap=settings.chunk_overlap)
|
||||
chunks = chunker.chunk(text)
|
||||
chunks = chunker.chunk(text)
|
||||
|
||||
if not chunks:
|
||||
raise HTTPException(status_code=400, detail="Document appears to be empty or could not be parsed")
|
||||
if not chunks:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail="Document appears to be empty or could not be parsed",
|
||||
)
|
||||
|
||||
metadata = extract_metadata(temp_path, chunks, original_filename=filename)
|
||||
metadata = extract_metadata(
|
||||
temp_path, chunks, original_filename=filename, document_id=document_id
|
||||
)
|
||||
rag.ingest_document(temp_path, chunks, metadata, document_id=document_id)
|
||||
|
||||
rag = RAGService(settings=settings)
|
||||
document_id = rag.ingest_document(temp_path, chunks, metadata)
|
||||
|
||||
logger.info("Ingested %s: %d chunks, doc_id=%s", filename, len(chunks), document_id)
|
||||
logger.info("Ingested %s: doc_id=%s", filename, document_id)
|
||||
|
||||
chunk_count = len(chunk_texts) if file_ext == ".pdf" else len(chunks)
|
||||
return IngestResponse(
|
||||
document_id=document_id,
|
||||
chunk_count=len(chunks),
|
||||
chunk_count=chunk_count,
|
||||
filename=filename,
|
||||
)
|
||||
|
||||
|
|
|
|||
|
|
@ -42,11 +42,12 @@ class RAGService:
|
|||
file_path: str,
|
||||
chunks: List[str],
|
||||
metadata_list: List[Dict[str, Any]],
|
||||
document_id: Optional[str] = None,
|
||||
) -> str:
|
||||
if not chunks:
|
||||
return ""
|
||||
|
||||
document_id = str(uuid.uuid4())
|
||||
document_id = document_id or str(uuid.uuid4())
|
||||
ids = [f"{document_id}_{i}" for i in range(len(chunks))]
|
||||
|
||||
self.collection.add(
|
||||
|
|
|
|||
|
|
@ -27,14 +27,15 @@ class TestIngest:
|
|||
with patch("app.services.rag.RAGService") as mock_rag_class:
|
||||
mock_rag = MagicMock()
|
||||
mock_rag.ingest_document.return_value = "doc-123"
|
||||
mock_rag.list_documents.return_value = ([], 0, 0)
|
||||
mock_rag_class.return_value = mock_rag
|
||||
|
||||
with patch("app.utils.pdf_parser.parse_pdf") as mock_parse:
|
||||
mock_parse.return_value = "Parsed PDF text content"
|
||||
with patch("app.utils.pdf_parser.parse_pdf_by_page") as mock_parse:
|
||||
mock_parse.return_value = [(1, "Page 1 text"), (2, "Page 2 text")]
|
||||
|
||||
with patch("app.utils.chunking.TokenChunkingStrategy") as mock_chunk_class:
|
||||
mock_chunker = MagicMock()
|
||||
mock_chunker.chunk.return_value = ["chunk 1", "chunk 2"]
|
||||
mock_chunker.chunk_pages.return_value = [("chunk 1", 1), ("chunk 2", 2)]
|
||||
mock_chunk_class.return_value = mock_chunker
|
||||
|
||||
with patch("app.utils.metadata.extract_metadata") as mock_meta:
|
||||
|
|
@ -43,10 +44,11 @@ class TestIngest:
|
|||
{"filename": "test.pdf", "chunk_index": 1},
|
||||
]
|
||||
|
||||
response = client.post(
|
||||
"/api/v1/ingest",
|
||||
files={"file": ("test.pdf", io.BytesIO(b"%PDF-1.4"), "application/pdf")},
|
||||
)
|
||||
with patch("app.utils.pdf_extractor.extract_page_as_pdf"):
|
||||
response = client.post(
|
||||
"/api/v1/ingest",
|
||||
files={"file": ("test.pdf", io.BytesIO(b"%PDF-1.4"), "application/pdf")},
|
||||
)
|
||||
|
||||
assert response.status_code == 200
|
||||
data = response.json()
|
||||
|
|
@ -61,6 +63,7 @@ class TestIngest:
|
|||
with patch("app.services.rag.RAGService") as mock_rag_class:
|
||||
mock_rag = MagicMock()
|
||||
mock_rag.ingest_document.return_value = "doc-456"
|
||||
mock_rag.list_documents.return_value = ([], 0, 0)
|
||||
mock_rag_class.return_value = mock_rag
|
||||
|
||||
with patch("app.utils.docx_parser.parse_docx") as mock_parse:
|
||||
|
|
|
|||
|
|
@ -0,0 +1,435 @@
|
|||
"""Phase 1.5.5c tests: Page-aware ingest router.
|
||||
|
||||
Covers:
|
||||
1. PDF upload triggers page-aware pipeline (parse_pdf_by_page, chunk_pages, extract_page_as_pdf)
|
||||
2. DOCX upload uses old pipeline with document_id
|
||||
3. TXT upload uses old pipeline with document_id
|
||||
4. Same-filename replacement: existing document found → old chunks + PDFs deleted
|
||||
5. Same-filename replacement: no existing document → no deletion
|
||||
6. Empty PDF (no pages with text) → 400 error
|
||||
7. Page PDFs saved to correct directory with correct naming
|
||||
8. Metadata includes page_number and chunk_file_path for PDF uploads
|
||||
9. Metadata does NOT include page_number for DOCX uploads (None)
|
||||
"""
|
||||
import io
|
||||
import os
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
from unittest.mock import MagicMock, patch, call
|
||||
|
||||
import pytest
|
||||
from fastapi.testclient import TestClient
|
||||
|
||||
|
||||
class TestPageAwareIngest:
    """Page-aware document ingestion tests.

    Every test posts a small in-memory upload to ``/api/v1/ingest`` with the
    RAG service, settings, parsers, chunker and metadata extractor patched, and
    then inspects the response and the recorded mock calls.
    """

    _INGEST_URL = "/api/v1/ingest"
    _PDF_MIME = "application/pdf"
    _DOCX_MIME = (
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    )

    @pytest.fixture
    def client(self):
        """Create test client with mocked dependencies."""
        from app.main import app

        return TestClient(app)

    @pytest.fixture
    def mock_settings(self):
        """Mock settings with document_chunk_path."""
        settings = MagicMock()
        settings.chunk_size = 1000
        settings.chunk_overlap = 200
        settings.document_chunk_path = "/tmp/test_document_chunk"
        return settings

    # ------------------------------------------------------------------ #
    # Shared helpers
    # ------------------------------------------------------------------ #
    @staticmethod
    def _make_rag(doc_id, listing=None):
        """Build the RAGService instance mock; *listing* is list_documents()'s result."""
        rag = MagicMock()
        rag.ingest_document.return_value = doc_id
        rag.list_documents.return_value = ([], 0, 0) if listing is None else listing
        return rag

    @staticmethod
    def _make_chunker(chunks=None, page_chunks=None):
        """Build a chunker mock with optional chunk() / chunk_pages() results."""
        chunker = MagicMock()
        if chunks is not None:
            chunker.chunk.return_value = chunks
        if page_chunks is not None:
            chunker.chunk_pages.return_value = page_chunks
        return chunker

    def _post(self, client, name, payload, mime):
        """Upload *payload* as a file called *name* to the ingest endpoint."""
        return client.post(
            self._INGEST_URL,
            files={"file": (name, io.BytesIO(payload), mime)},
        )

    # ------------------------------------------------------------------ #
    # Test 1: PDF upload triggers page-aware pipeline
    # ------------------------------------------------------------------ #
    def test_pdf_upload_uses_page_aware_pipeline(self, client, mock_settings):
        """PDF should go through parse_pdf_by_page → chunk_pages → extract_page_as_pdf."""
        doc_id = str(uuid.uuid4())

        # Only the instance returned by RAGService(...) is configured; patching
        # list_documents on the (already mocked) class had no effect.
        with patch("app.services.rag.RAGService") as mock_rag_class, \
             patch("app.core.config.get_settings", return_value=mock_settings), \
             patch("app.utils.pdf_parser.parse_pdf_by_page") as mock_parse_by_page, \
             patch("app.utils.chunking.TokenChunkingStrategy") as mock_chunk_class, \
             patch("app.utils.metadata.extract_metadata") as mock_meta, \
             patch("app.utils.pdf_extractor.extract_page_as_pdf") as mock_extract_page:

            mock_rag_class.return_value = self._make_rag(doc_id)

            # parse_pdf_by_page returns 2 pages
            mock_parse_by_page.return_value = [
                (1, "Page 1 text content"),
                (2, "Page 2 text content"),
            ]

            # chunk_pages returns one chunk per page
            mock_chunker = self._make_chunker(
                page_chunks=[
                    ("Page 1 text content", 1),
                    ("Page 2 text content", 2),
                ]
            )
            mock_chunk_class.return_value = mock_chunker

            mock_meta.return_value = [
                {"filename": "test.pdf", "chunk_index": 0, "page_number": 1},
                {"filename": "test.pdf", "chunk_index": 1, "page_number": 2},
            ]

            response = self._post(client, "test.pdf", b"%PDF-1.4", self._PDF_MIME)

            assert response.status_code == 200
            data = response.json()
            assert data["chunk_count"] == 2
            assert data["filename"] == "test.pdf"

            # Page-aware parsing ran exactly once
            mock_parse_by_page.assert_called_once()

            # chunk_pages was used, never the flat chunk()
            mock_chunker.chunk_pages.assert_called_once()
            mock_chunker.chunk.assert_not_called()

            # One page PDF extraction per returned chunk
            assert mock_extract_page.call_count == 2

    # ------------------------------------------------------------------ #
    # Test 2: DOCX upload uses old pipeline
    # ------------------------------------------------------------------ #
    def test_docx_upload_uses_old_pipeline(self, client, mock_settings):
        """DOCX should use parse_docx → chunk → metadata with document_id only."""
        doc_id = str(uuid.uuid4())

        with patch("app.services.rag.RAGService") as mock_rag_class, \
             patch("app.core.config.get_settings", return_value=mock_settings), \
             patch("app.utils.docx_parser.parse_docx") as mock_parse, \
             patch("app.utils.chunking.TokenChunkingStrategy") as mock_chunk_class, \
             patch("app.utils.metadata.extract_metadata") as mock_meta:

            mock_rag_class.return_value = self._make_rag(doc_id)
            mock_parse.return_value = "DOCX text content"
            mock_chunker = self._make_chunker(chunks=["chunk 1"])
            mock_chunk_class.return_value = mock_chunker
            mock_meta.return_value = [{"filename": "test.docx", "chunk_index": 0}]

            response = self._post(client, "test.docx", b"docx", self._DOCX_MIME)

            assert response.status_code == 200
            data = response.json()
            assert data["chunk_count"] == 1
            assert data["filename"] == "test.docx"

            # Old pipeline: parse_docx → chunk (not chunk_pages)
            mock_parse.assert_called_once()
            mock_chunker.chunk.assert_called_once()
            mock_chunker.chunk_pages.assert_not_called()

            # A real, non-empty document_id keyword must reach extract_metadata.
            # (Matching the name inside str(call) would also accept
            # document_id=None.)
            meta_kwargs = mock_meta.call_args[1]
            assert meta_kwargs.get("document_id")

    # ------------------------------------------------------------------ #
    # Test 3: TXT upload uses old pipeline
    # ------------------------------------------------------------------ #
    def test_txt_upload_uses_old_pipeline(self, client, mock_settings):
        """TXT should read file → chunk → metadata with document_id."""
        doc_id = str(uuid.uuid4())

        with patch("app.services.rag.RAGService") as mock_rag_class, \
             patch("app.core.config.get_settings", return_value=mock_settings), \
             patch("app.utils.chunking.TokenChunkingStrategy") as mock_chunk_class, \
             patch("app.utils.metadata.extract_metadata") as mock_meta:

            mock_rag_class.return_value = self._make_rag(doc_id)
            mock_chunker = self._make_chunker(chunks=["txt chunk"])
            mock_chunk_class.return_value = mock_chunker
            mock_meta.return_value = [{"filename": "notes.txt", "chunk_index": 0}]

            response = self._post(
                client, "notes.txt", b"Text content here", "text/plain"
            )

            assert response.status_code == 200
            data = response.json()
            assert data["chunk_count"] == 1
            assert data["filename"] == "notes.txt"

            mock_chunker.chunk.assert_called_once()
            mock_chunker.chunk_pages.assert_not_called()

    # ------------------------------------------------------------------ #
    # Test 4: Same-filename replacement: existing document → deletion
    # ------------------------------------------------------------------ #
    def test_same_filename_replacement_deletes_old(self, client, mock_settings, tmp_path):
        """Uploading file with same filename should delete old chunks and chunk PDFs."""
        doc_id = str(uuid.uuid4())
        old_doc_id = "old-doc-uuid-1234"
        chunk_dir = tmp_path / "document_chunk"
        chunk_dir.mkdir()
        old_pdf = chunk_dir / "test_page_3.pdf"
        old_pdf.write_text("old chunk pdf")

        mock_settings.document_chunk_path = str(chunk_dir)

        with patch("app.services.rag.RAGService") as mock_rag_class, \
             patch("app.core.config.get_settings", return_value=mock_settings), \
             patch("app.utils.pdf_parser.parse_pdf_by_page") as mock_parse_by_page, \
             patch("app.utils.chunking.TokenChunkingStrategy") as mock_chunk_class, \
             patch("app.utils.metadata.extract_metadata") as mock_meta, \
             patch("app.utils.pdf_extractor.extract_page_as_pdf"):

            # list_documents reports an existing document with the same filename
            mock_rag = self._make_rag(
                doc_id,
                listing=(
                    [{"document_id": old_doc_id, "filename": "test.pdf", "chunk_count": 3}],
                    1, 3,
                ),
            )
            mock_rag_class.return_value = mock_rag

            # list_chunks reports the chunk PDFs that belong to the old document
            mock_rag.list_chunks.return_value = [
                {"chunk_id": f"{old_doc_id}_0", "chunk_file_path": "test_page_3.pdf"},
                {"chunk_id": f"{old_doc_id}_1", "chunk_file_path": "test_page_4.pdf"},
            ]

            mock_parse_by_page.return_value = [(1, "New page text")]
            mock_chunk_class.return_value = self._make_chunker(
                page_chunks=[("New page text", 1)]
            )
            mock_meta.return_value = [{"filename": "test.pdf", "chunk_index": 0}]

            response = self._post(client, "test.pdf", b"%PDF-1.4", self._PDF_MIME)

            assert response.status_code == 200

            # The old document must be removed from the store
            mock_rag.delete_document.assert_called_once_with(old_doc_id)

            # NOTE(review): removal of old_pdf from disk is not asserted here.
            # It depends on the router actually reading mock_settings —
            # confirm before adding `assert not old_pdf.exists()`.

    # ------------------------------------------------------------------ #
    # Test 5: Same-filename replacement: no existing document
    # ------------------------------------------------------------------ #
    def test_no_existing_document_no_deletion(self, client, mock_settings):
        """Uploading new filename should NOT trigger any deletion."""
        doc_id = str(uuid.uuid4())

        with patch("app.services.rag.RAGService") as mock_rag_class, \
             patch("app.core.config.get_settings", return_value=mock_settings), \
             patch("app.utils.pdf_parser.parse_pdf_by_page") as mock_parse_by_page, \
             patch("app.utils.chunking.TokenChunkingStrategy") as mock_chunk_class, \
             patch("app.utils.metadata.extract_metadata") as mock_meta, \
             patch("app.utils.pdf_extractor.extract_page_as_pdf"):

            mock_rag = self._make_rag(doc_id)
            mock_rag_class.return_value = mock_rag

            mock_parse_by_page.return_value = [(1, "Page text")]
            mock_chunk_class.return_value = self._make_chunker(
                page_chunks=[("Page text", 1)]
            )
            mock_meta.return_value = [{"filename": "newdoc.pdf", "chunk_index": 0}]

            response = self._post(client, "newdoc.pdf", b"%PDF-1.4", self._PDF_MIME)

            assert response.status_code == 200

            # No deletion may happen for a previously unseen filename
            mock_rag.delete_document.assert_not_called()

    # ------------------------------------------------------------------ #
    # Test 6: Empty PDF → 400 error
    # ------------------------------------------------------------------ #
    def test_empty_pdf_returns_400(self, client, mock_settings):
        """PDF with no extractable text should return 400."""
        with patch("app.core.config.get_settings", return_value=mock_settings), \
             patch("app.utils.pdf_parser.parse_pdf_by_page") as mock_parse_by_page, \
             patch("app.services.rag.RAGService") as mock_rag_class:

            mock_rag = MagicMock()
            mock_rag.list_documents.return_value = ([], 0, 0)
            mock_rag_class.return_value = mock_rag

            # Empty PDF: the parser yields no pages
            mock_parse_by_page.return_value = []

            response = self._post(client, "empty.pdf", b"%PDF-1.4", self._PDF_MIME)

            assert response.status_code == 400
            assert "empty" in response.json()["detail"].lower()

    # ------------------------------------------------------------------ #
    # Test 7: Page PDFs saved with correct naming
    # ------------------------------------------------------------------ #
    def test_page_pdf_naming_convention(self, client, mock_settings, tmp_path):
        """Chunk PDFs should be named {stem}_page_{N}.pdf with relative paths in metadata."""
        doc_id = str(uuid.uuid4())
        chunk_dir = tmp_path / "document_chunk"
        chunk_dir.mkdir()
        mock_settings.document_chunk_path = str(chunk_dir)

        with patch("app.services.rag.RAGService") as mock_rag_class, \
             patch("app.core.config.get_settings", return_value=mock_settings), \
             patch("app.utils.pdf_parser.parse_pdf_by_page") as mock_parse_by_page, \
             patch("app.utils.chunking.TokenChunkingStrategy") as mock_chunk_class, \
             patch("app.utils.metadata.extract_metadata") as mock_meta, \
             patch("app.utils.pdf_extractor.extract_page_as_pdf") as mock_extract_page:

            mock_rag_class.return_value = self._make_rag(doc_id)

            mock_parse_by_page.return_value = [
                (1, "Page 1"),
                (3, "Page 3"),  # page 2 was empty, skipped
            ]
            mock_chunk_class.return_value = self._make_chunker(
                page_chunks=[
                    ("Page 1", 1),
                    ("Page 3", 3),
                ]
            )
            mock_meta.return_value = [
                {"filename": "NEC4 ACC.pdf", "chunk_index": 0},
                {"filename": "NEC4 ACC.pdf", "chunk_index": 1},
            ]

            response = self._post(client, "NEC4 ACC.pdf", b"%PDF-1.4", self._PDF_MIME)

            assert response.status_code == 200

            # extract_page_as_pdf is called once per chunk, in order
            calls = mock_extract_page.call_args_list
            assert len(calls) == 2

            # Third positional argument is the output path.
            # First call: page 1 → "NEC4 ACC_page_1.pdf"
            output_path_1 = calls[0][0][2]
            assert output_path_1.endswith("NEC4 ACC_page_1.pdf")

            # Second call: page 3 → "NEC4 ACC_page_3.pdf"
            output_path_3 = calls[1][0][2]
            assert output_path_3.endswith("NEC4 ACC_page_3.pdf")

            # chunk_dir was created by this test; this only confirms that the
            # request left the directory in place.
            assert os.path.isdir(str(chunk_dir))

    # ------------------------------------------------------------------ #
    # Test 8: Metadata includes page_number and chunk_file_path for PDFs
    # ------------------------------------------------------------------ #
    def test_pdf_metadata_includes_page_info(self, client, mock_settings, tmp_path):
        """PDF metadata should include page_number and chunk_file_path."""
        doc_id = str(uuid.uuid4())
        chunk_dir = tmp_path / "document_chunk"
        chunk_dir.mkdir()
        mock_settings.document_chunk_path = str(chunk_dir)

        with patch("app.services.rag.RAGService") as mock_rag_class, \
             patch("app.core.config.get_settings", return_value=mock_settings), \
             patch("app.utils.pdf_parser.parse_pdf_by_page") as mock_parse_by_page, \
             patch("app.utils.chunking.TokenChunkingStrategy") as mock_chunk_class, \
             patch("app.utils.metadata.extract_metadata") as mock_meta, \
             patch("app.utils.pdf_extractor.extract_page_as_pdf"):

            mock_rag_class.return_value = self._make_rag(doc_id)

            mock_parse_by_page.return_value = [(2, "Page 2 content")]
            mock_chunk_class.return_value = self._make_chunker(
                page_chunks=[("Page 2 content", 2)]
            )
            mock_meta.return_value = [
                {"filename": "doc.pdf", "chunk_index": 0, "page_number": 2, "chunk_file_path": "doc_page_2.pdf"},
            ]

            response = self._post(client, "doc.pdf", b"%PDF-1.4", self._PDF_MIME)

            assert response.status_code == 200

            # extract_metadata must receive page_numbers and chunk_file_paths
            meta_call_kwargs = mock_meta.call_args[1]
            assert "page_numbers" in meta_call_kwargs
            assert meta_call_kwargs["page_numbers"] == [2]
            assert "chunk_file_paths" in meta_call_kwargs
            assert meta_call_kwargs["chunk_file_paths"] == ["doc_page_2.pdf"]

    # ------------------------------------------------------------------ #
    # Test 9: Metadata does NOT include page_number for DOCX (None)
    # ------------------------------------------------------------------ #
    def test_docx_metadata_no_page_info(self, client, mock_settings):
        """DOCX metadata should have page_number=None (no page_numbers passed)."""
        doc_id = str(uuid.uuid4())

        with patch("app.services.rag.RAGService") as mock_rag_class, \
             patch("app.core.config.get_settings", return_value=mock_settings), \
             patch("app.utils.docx_parser.parse_docx") as mock_parse, \
             patch("app.utils.chunking.TokenChunkingStrategy") as mock_chunk_class, \
             patch("app.utils.metadata.extract_metadata") as mock_meta:

            mock_rag_class.return_value = self._make_rag(doc_id)

            mock_parse.return_value = "DOCX content"
            mock_chunk_class.return_value = self._make_chunker(chunks=["chunk 1"])
            mock_meta.return_value = [{"filename": "test.docx", "chunk_index": 0}]

            response = self._post(client, "test.docx", b"docx", self._DOCX_MIME)

            assert response.status_code == 200

            # extract_metadata must be called WITHOUT page information
            meta_call_kwargs = mock_meta.call_args[1]
            assert meta_call_kwargs.get("page_numbers") is None
            assert meta_call_kwargs.get("chunk_file_paths") is None
|
||||
Loading…
Reference in New Issue