"""Documents CRUD router for RAG Database management."""

import logging
import os
from typing import Optional

from fastapi import APIRouter, HTTPException
from fastapi.responses import FileResponse

from app.models.documents import (
    DocumentInfo,
    ChunkInfo,
    DocumentListResponse,
    DeleteResponse,
)
from app.services.rag import RAGService

logger = logging.getLogger(__name__)

router = APIRouter(tags=["documents"])


def _load_settings():
    """Return the application settings object."""
    # Kept function-scoped exactly as the handlers originally imported it
    # (presumably to avoid an import cycle at module load -- confirm).
    from app.core.config import get_settings

    return get_settings()


def _resolve_within(base_dir: str, relative_path: str) -> Optional[str]:
    """Resolve ``relative_path`` under ``base_dir``.

    Returns the fully resolved path when it is ``base_dir`` itself or lies
    inside it, and ``None`` when it escapes the directory or cannot be
    resolved at all (e.g. an embedded NUL byte, or a different drive on
    Windows).
    """
    try:
        base = os.path.realpath(base_dir)
        candidate = os.path.realpath(os.path.join(base, relative_path))
        # commonpath compares whole path components, so "/data/chunks_evil"
        # is not mistaken for a child of "/data/chunks", and a base of "/"
        # works (a plain startswith(base + os.sep) check fails for that).
        inside = os.path.commonpath([base, candidate]) == base
    except ValueError:
        return None
    return candidate if inside else None


@router.get("/documents", response_model=DocumentListResponse)
async def list_documents():
    """List every stored document together with overall totals."""
    rag = RAGService(settings=_load_settings())
    doc_list, total_docs, total_chunks = rag.list_documents()

    documents = [
        DocumentInfo(
            document_id=d["document_id"],
            filename=d["filename"],
            chunk_count=d["chunk_count"],
            upload_date=d["upload_date"],
        )
        for d in doc_list
    ]
    return DocumentListResponse(
        documents=documents,
        total_documents=total_docs,
        total_chunks=total_chunks,
    )


@router.get("/documents/{document_id}/chunks", response_model=list[ChunkInfo])
async def list_chunks(document_id: str):
    """List the chunks that belong to one document."""
    rag = RAGService(settings=_load_settings())
    chunks = rag.list_chunks(document_id)

    return [
        ChunkInfo(
            chunk_id=c["chunk_id"],
            chunk_index=c["chunk_index"],
            content_summary=c["content_summary"],
            # Optional fields: absent keys become None.
            page_number=c.get("page_number"),
            chunk_file_path=c.get("chunk_file_path"),
        )
        for c in chunks
    ]


def _remove_chunk_file(file_path: str, base_dir: str) -> None:
    """Remove a chunk file from disk on a best-effort basis.

    Args:
        file_path: Path of the chunk file, relative to ``base_dir``.
        base_dir: Directory that holds the chunk files.

    Never raises for filesystem problems: a missing file, a path that
    resolves outside ``base_dir``, or any other ``OSError`` is logged and
    skipped so that a cleanup failure does not fail the delete request.
    """
    full_path = os.path.join(base_dir, file_path)

    # Same containment rule as get_chunk_pdf: never touch anything that
    # resolves outside the chunk directory (absolute paths, "..", symlinks).
    if _resolve_within(base_dir, file_path) is None:
        logger.warning(
            "Refusing to remove chunk file outside %s: %s", base_dir, file_path
        )
        return

    try:
        os.remove(full_path)
    except FileNotFoundError:
        logger.warning("Chunk file not found, skipping: %s", full_path)
    except OSError:
        logger.exception("Failed to remove chunk file: %s", full_path)
    else:
        logger.info("Removed chunk file: %s", full_path)


@router.delete("/documents/{document_id}", response_model=DeleteResponse)
async def delete_document(document_id: str):
    """Delete a document, its chunks, and the chunk files on disk.

    Responds with 404 when the RAG service reports the document was not found.
    """
    settings = _load_settings()
    rag = RAGService(settings=settings)

    # Collect chunk file paths before ChromaDB deletion (metadata lost after delete)
    chunk_paths = [
        c["chunk_file_path"]
        for c in rag.list_chunks(document_id)
        if c.get("chunk_file_path")
    ]

    success, deleted_count = rag.delete_document(document_id)
    if not success:
        raise HTTPException(status_code=404, detail=f"Document not found: {document_id}")

    # Files are removed only after the DB deletion succeeded, so a failed
    # delete never leaves DB entries pointing at files that are already gone.
    for rel_path in chunk_paths:
        _remove_chunk_file(rel_path, settings.document_chunk_path)

    logger.info("Deleted document %s: %d chunks removed", document_id, deleted_count)
    return DeleteResponse(
        deleted=True,
        message=f"Deleted document {document_id}: {deleted_count} chunks removed",
    )


@router.delete("/chunks/{chunk_id}", response_model=DeleteResponse)
async def delete_chunk(chunk_id: str):
    """Delete a single chunk and its file on disk.

    Responds with 404 when the RAG service reports the chunk was not found.
    """
    settings = _load_settings()
    rag = RAGService(settings=settings)

    # Get chunk metadata before ChromaDB deletion (metadata lost after delete)
    result = rag.collection.get(ids=[chunk_id], include=["metadatas"])
    # Guard defensively: the list may be missing/None/empty and an entry may
    # itself be None (e.g. a chunk stored without metadata -- TODO confirm).
    metadatas = result.get("metadatas") or []
    metadata = metadatas[0] if metadatas else None
    chunk_file_path = metadata.get("chunk_file_path") if metadata else None

    success = rag.delete_chunk(chunk_id)
    if not success:
        raise HTTPException(status_code=404, detail=f"Chunk not found: {chunk_id}")

    # Remove the file only once the DB deletion has succeeded.
    if chunk_file_path:
        _remove_chunk_file(chunk_file_path, settings.document_chunk_path)

    logger.info("Deleted chunk: %s", chunk_id)
    return DeleteResponse(
        deleted=True,
        message=f"Deleted chunk {chunk_id}",
    )


@router.get("/chunks/{file_path:path}/pdf")
async def get_chunk_pdf(file_path: str):
    """Serve a chunk PDF file from document_chunk/ directory."""
    # Path traversal protection
    if ".." in file_path:
        raise HTTPException(status_code=400, detail="Invalid file path")

    settings = _load_settings()

    # Ensure resolved path is within base directory; unresolvable input
    # (e.g. an embedded NUL byte) is rejected the same way instead of
    # surfacing as a 500.
    full_path = _resolve_within(settings.document_chunk_path, file_path)
    if full_path is None:
        raise HTTPException(status_code=400, detail="Invalid file path")

    if not os.path.isfile(full_path):
        raise HTTPException(status_code=404, detail=f"Chunk file not found: {file_path}")

    return FileResponse(
        full_path,
        media_type="application/pdf",
        filename=os.path.basename(full_path),
    )