# legco_ai_assistant/backend/app/routers/documents.py
#
# 129 lines
# 3.7 KiB
# Python

"""Documents CRUD router for RAG Database management."""
import logging
import os
from fastapi import APIRouter, HTTPException
from app.models.documents import (
DocumentInfo,
ChunkInfo,
DocumentListResponse,
DeleteResponse,
)
from app.services.rag import RAGService
logger = logging.getLogger(__name__)
router = APIRouter(tags=["documents"])
@router.get("/documents", response_model=DocumentListResponse)
async def list_documents():
    """List the documents reported by the RAG service, with overall totals.

    Returns:
        A ``DocumentListResponse`` holding one ``DocumentInfo`` per document
        plus the total document and chunk counts returned by
        ``RAGService.list_documents``.
    """
    # Imported inside the handler, as in the other routes of this module.
    from app.core.config import get_settings

    service = RAGService(settings=get_settings())
    records, document_total, chunk_total = service.list_documents()

    infos = []
    for record in records:
        infos.append(
            DocumentInfo(
                document_id=record["document_id"],
                filename=record["filename"],
                chunk_count=record["chunk_count"],
                upload_date=record["upload_date"],
            )
        )

    return DocumentListResponse(
        documents=infos,
        total_documents=document_total,
        total_chunks=chunk_total,
    )
@router.get("/documents/{document_id}/chunks", response_model=list[ChunkInfo])
async def list_chunks(document_id: str):
    """List the chunks the RAG service holds for one document.

    Args:
        document_id: Identifier of the document whose chunks are listed.

    Returns:
        A list of ``ChunkInfo`` objects, one per chunk returned by
        ``RAGService.list_chunks``. ``page_number`` and ``chunk_file_path``
        are ``None`` when the chunk record does not carry them.
    """
    # Imported inside the handler, as in the other routes of this module.
    from app.core.config import get_settings

    service = RAGService(settings=get_settings())

    chunk_infos = []
    for record in service.list_chunks(document_id):
        info = ChunkInfo(
            chunk_id=record["chunk_id"],
            chunk_index=record["chunk_index"],
            content_summary=record["content_summary"],
            # Optional fields: absent keys become None instead of raising.
            page_number=record.get("page_number"),
            chunk_file_path=record.get("chunk_file_path"),
        )
        chunk_infos.append(info)
    return chunk_infos
def _remove_chunk_file(file_path: str, base_dir: str) -> None:
    """Remove a chunk file from disk on a best-effort basis.

    Args:
        file_path: Path of the chunk file, expected to be relative to
            ``base_dir``.
        base_dir: Directory under which chunk files are stored.

    The function never raises for filesystem problems: a missing file, a
    path that resolves outside ``base_dir``, or any other ``OSError`` is
    logged and skipped so that callers can finish their cleanup.
    """
    full_path = os.path.join(base_dir, file_path)

    # os.path.join drops base_dir when file_path is absolute, and ".."
    # segments can climb out of it. Resolve both sides and refuse anything
    # that does not stay inside the chunk directory.
    base_real = os.path.realpath(base_dir)
    target_real = os.path.realpath(full_path)
    try:
        inside_base = os.path.commonpath([base_real, target_real]) == base_real
    except ValueError:
        # Raised e.g. when the two paths live on different drives.
        inside_base = False
    if not inside_base:
        logger.warning(
            "Refusing to remove chunk file outside chunk directory: %s", full_path
        )
        return

    try:
        os.remove(full_path)
        logger.info("Removed chunk file: %s", full_path)
    except FileNotFoundError:
        logger.warning("Chunk file not found, skipping: %s", full_path)
    except OSError:
        # Permission problems, a directory at that path, etc. Keep going so
        # one bad file does not abort the rest of the caller's cleanup.
        logger.exception("Failed to remove chunk file: %s", full_path)
@router.delete("/documents/{document_id}", response_model=DeleteResponse)
async def delete_document(document_id: str):
    """Delete a document, its indexed chunks, and its chunk files on disk.

    Args:
        document_id: Identifier of the document to delete.

    Returns:
        A ``DeleteResponse`` with ``deleted=True`` and a message stating how
        many chunks were removed.

    Raises:
        HTTPException: 404 when ``RAGService.delete_document`` reports that
            nothing was deleted for ``document_id``.
    """
    from app.core.config import get_settings

    settings = get_settings()
    rag = RAGService(settings=settings)

    # Collect chunk file paths before ChromaDB deletion (metadata lost after delete)
    chunk_paths = [
        c["chunk_file_path"]
        for c in rag.list_chunks(document_id)
        if c.get("chunk_file_path")
    ]

    success, deleted_count = rag.delete_document(document_id)
    if not success:
        raise HTTPException(status_code=404, detail=f"Document not found: {document_id}")

    # Touch the filesystem only after the index deletion is confirmed, so a
    # 404 or a failure above never leaves index entries pointing at files
    # that were already removed.
    for rel_path in chunk_paths:
        try:
            _remove_chunk_file(rel_path, settings.document_chunk_path)
        except OSError:
            # File cleanup is best-effort; the document is already gone from
            # the index, so report success and leave the orphan in the log.
            logger.exception("Failed to remove chunk file: %s", rel_path)

    logger.info("Deleted document %s: %d chunks removed", document_id, deleted_count)
    return DeleteResponse(
        deleted=True,
        message=f"Deleted document {document_id}: {deleted_count} chunks removed",
    )
@router.delete("/chunks/{chunk_id}", response_model=DeleteResponse)
async def delete_chunk(chunk_id: str):
    """Delete a single chunk from the index and remove its file on disk.

    Args:
        chunk_id: Identifier of the chunk to delete.

    Returns:
        A ``DeleteResponse`` with ``deleted=True`` and a confirmation message.

    Raises:
        HTTPException: 404 when ``RAGService.delete_chunk`` reports that the
            chunk was not deleted.
    """
    from app.core.config import get_settings

    settings = get_settings()
    rag = RAGService(settings=settings)

    # Get chunk metadata before ChromaDB deletion (metadata lost after delete)
    result = rag.collection.get(ids=[chunk_id], include=["metadatas"])
    # Both the list and its entries may be None (e.g. a chunk stored without
    # metadata), so normalise before calling .get on anything.
    metadatas = result.get("metadatas") or []
    metadata = metadatas[0] if metadatas else None
    chunk_file_path = metadata.get("chunk_file_path") if metadata else None

    success = rag.delete_chunk(chunk_id)
    if not success:
        raise HTTPException(status_code=404, detail=f"Chunk not found: {chunk_id}")

    # Remove the file only after the index deletion is confirmed, so a 404 or
    # a failure above never leaves an index entry pointing at a missing file.
    if chunk_file_path:
        try:
            _remove_chunk_file(chunk_file_path, settings.document_chunk_path)
        except OSError:
            # File cleanup is best-effort; the chunk is already gone from the
            # index, so report success and leave the orphan in the log.
            logger.exception("Failed to remove chunk file: %s", chunk_file_path)

    logger.info("Deleted chunk: %s", chunk_id)
    return DeleteResponse(
        deleted=True,
        message=f"Deleted chunk {chunk_id}",
    )