From 4732b4949c07d3acec02aeeef664ecc0132b18ca Mon Sep 17 00:00:00 2001 From: Woody Date: Fri, 24 Apr 2026 10:53:34 +0800 Subject: [PATCH] feat(backend): clean up chunk PDFs on document and chunk deletion Delete document endpoint now removes associated chunk PDF files from document_chunk/ before ChromaDB deletion. Delete chunk endpoint removes individual chunk PDF. Missing files logged as warnings, not errors. Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent) Co-authored-by: Sisyphus --- backend/app/routers/documents.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/backend/app/routers/documents.py b/backend/app/routers/documents.py index a9d6240..ba16725 100644 --- a/backend/app/routers/documents.py +++ b/backend/app/routers/documents.py @@ -1,5 +1,6 @@ """Documents CRUD router for RAG Database management.""" import logging +import os from fastapi import APIRouter, HTTPException @@ -62,6 +63,16 @@ async def list_chunks(document_id: str): ] +def _remove_chunk_file(file_path: str, base_dir: str) -> None: + """Remove a chunk file from disk. Logs warning if file is missing.""" + full_path = os.path.join(base_dir, file_path) + try: + os.remove(full_path) + logger.info("Removed chunk file: %s", full_path) + except FileNotFoundError: + logger.warning("Chunk file not found, skipping: %s", full_path) + + @router.delete("/documents/{document_id}", response_model=DeleteResponse) async def delete_document(document_id: str): from app.core.config import get_settings @@ -69,6 +80,15 @@ async def delete_document(document_id: str): settings = get_settings() rag = RAGService(settings=settings) + # Collect chunk file paths before ChromaDB deletion (metadata lost after delete) + chunks = rag.list_chunks(document_id) + chunk_paths = [ + c["chunk_file_path"] for c in chunks if c.get("chunk_file_path") + ] + + for rel_path in chunk_paths: + _remove_chunk_file(rel_path, settings.document_chunk_path) + success, deleted_count = rag.delete_document(document_id) if not success: @@ -89,6 +109,12 @@ async def delete_chunk(chunk_id: str): settings = get_settings() rag = RAGService(settings=settings) + # Get chunk metadata before ChromaDB deletion (metadata lost after delete) + result = rag.collection.get(ids=[chunk_id], include=["metadatas"]) + metadatas = result.get("metadatas", []) + if metadatas and metadatas[0].get("chunk_file_path"): + _remove_chunk_file(metadatas[0]["chunk_file_path"], settings.document_chunk_path) + success = rag.delete_chunk(chunk_id) if not success: