feat(backend): clean up chunk PDFs on document and chunk deletion
The delete-document endpoint now removes the associated chunk PDF files from document_chunk/ before the ChromaDB deletion. The delete-chunk endpoint removes the individual chunk PDF. Missing files are logged as warnings, not errors.

Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent)

Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
This commit is contained in:
parent
b2dd385443
commit
4732b4949c
|
|
@ -1,5 +1,6 @@
|
|||
"""Documents CRUD router for RAG Database management."""
|
||||
import logging
|
||||
import os
|
||||
|
||||
from fastapi import APIRouter, HTTPException
|
||||
|
||||
|
|
@ -62,6 +63,16 @@ async def list_chunks(document_id: str):
|
|||
]
|
||||
|
||||
|
||||
def _remove_chunk_file(file_path: str, base_dir: str) -> None:
    """Remove a chunk file stored under ``base_dir`` from disk.

    Args:
        file_path: Path of the chunk file, joined onto ``base_dir``.
        base_dir: Directory that holds the chunk files.

    A missing file is logged as a warning instead of raising. A path that
    resolves outside ``base_dir`` is never deleted; it is logged as a
    warning and skipped. Any other ``OSError`` raised by ``os.remove``
    still propagates to the caller.
    """
    full_path = os.path.join(base_dir, file_path)

    # os.path.join discards base_dir when file_path is absolute, and ".."
    # segments can climb out of it, so confirm the resolved target is still
    # inside base_dir before deleting anything.
    base_root = os.path.normcase(os.path.realpath(base_dir))
    target = os.path.normcase(os.path.realpath(full_path))
    # join(base_root, "") appends exactly one trailing separator, so a sibling
    # directory such as "<base_dir>_old" cannot pass the prefix test.
    if not target.startswith(os.path.join(base_root, "")):
        logger.warning(
            "Chunk file outside chunk directory, skipping: %s", full_path
        )
        return

    try:
        os.remove(full_path)
    except FileNotFoundError:
        logger.warning("Chunk file not found, skipping: %s", full_path)
    else:
        logger.info("Removed chunk file: %s", full_path)
|
||||
|
||||
|
||||
@router.delete("/documents/{document_id}", response_model=DeleteResponse)
|
||||
async def delete_document(document_id: str):
|
||||
from app.core.config import get_settings
|
||||
|
|
@ -69,6 +80,15 @@ async def delete_document(document_id: str):
|
|||
settings = get_settings()
|
||||
rag = RAGService(settings=settings)
|
||||
|
||||
# Collect chunk file paths before ChromaDB deletion (metadata lost after delete)
|
||||
chunks = rag.list_chunks(document_id)
|
||||
chunk_paths = [
|
||||
c["chunk_file_path"] for c in chunks if c.get("chunk_file_path")
|
||||
]
|
||||
|
||||
for rel_path in chunk_paths:
|
||||
_remove_chunk_file(rel_path, settings.document_chunk_path)
|
||||
|
||||
success, deleted_count = rag.delete_document(document_id)
|
||||
|
||||
if not success:
|
||||
|
|
@ -89,6 +109,12 @@ async def delete_chunk(chunk_id: str):
|
|||
settings = get_settings()
|
||||
rag = RAGService(settings=settings)
|
||||
|
||||
# Get chunk metadata before ChromaDB deletion (metadata lost after delete)
|
||||
result = rag.collection.get(ids=[chunk_id], include=["metadatas"])
|
||||
metadatas = result.get("metadatas", [])
|
||||
if metadatas and metadatas[0].get("chunk_file_path"):
|
||||
_remove_chunk_file(metadatas[0]["chunk_file_path"], settings.document_chunk_path)
|
||||
|
||||
success = rag.delete_chunk(chunk_id)
|
||||
|
||||
if not success:
|
||||
|
|
|
|||
Loading…
Reference in New Issue