166 lines
5.2 KiB
Python
166 lines
5.2 KiB
Python
"""Documents CRUD router for RAG Database management."""
|
|
import logging
|
|
import os
|
|
|
|
from fastapi import APIRouter, HTTPException
|
|
from fastapi.responses import FileResponse
|
|
|
|
from app.models.documents import (
|
|
DocumentInfo,
|
|
ChunkInfo,
|
|
DocumentListResponse,
|
|
DeleteResponse,
|
|
)
|
|
from app.services.rag import RAGService
|
|
|
|
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
# Router that the endpoints in this module register on; it is created with
# the "documents" tag.
router = APIRouter(tags=["documents"])
|
|
|
|
|
|
@router.get("/documents", response_model=DocumentListResponse)
async def list_documents():
    """List every stored document together with overall totals.

    Each record reported by ``RAGService.list_documents`` is converted into a
    ``DocumentInfo``. A record without a ``chunking_strategy`` key is
    reported with the strategy ``"token"``.
    """
    # Settings are imported at call time, not at module import time.
    from app.core.config import get_settings

    service = RAGService(settings=get_settings())
    records, document_total, chunk_total = service.list_documents()

    infos = []
    for record in records:
        infos.append(
            DocumentInfo(
                document_id=record["document_id"],
                filename=record["filename"],
                chunk_count=record["chunk_count"],
                upload_date=record["upload_date"],
                chunking_strategy=record.get("chunking_strategy", "token"),
            )
        )

    return DocumentListResponse(
        documents=infos,
        total_documents=document_total,
        total_chunks=chunk_total,
    )
|
|
|
|
|
|
@router.get("/documents/{document_id}/chunks", response_model=list[ChunkInfo])
async def list_chunks(document_id: str):
    """List the chunks stored for one document.

    ``chunk_id``, ``chunk_index`` and ``content_summary`` are required on
    every chunk record. All remaining fields are reported as ``None`` when
    the record does not carry them.
    """
    # Settings are imported at call time, not at module import time.
    from app.core.config import get_settings

    service = RAGService(settings=get_settings())

    # Keys read with ``dict.get``: a missing key is passed on as ``None``.
    optional_keys = (
        "page_number",
        "chunk_file_path",
        "strategy_type",
        "question_index",
        "question_id",
        "question_text",
        "section_heading",
        "answer_contains_table",
        "source_page_range",
        "parent_topic",
    )

    infos = []
    for record in service.list_chunks(document_id):
        infos.append(
            ChunkInfo(
                chunk_id=record["chunk_id"],
                chunk_index=record["chunk_index"],
                content_summary=record["content_summary"],
                **{key: record.get(key) for key in optional_keys},
            )
        )
    return infos
|
|
|
|
|
|
def _remove_chunk_file(file_path: str, base_dir: str) -> None:
    """Remove a chunk file stored under ``base_dir`` (best effort).

    Args:
        file_path: Location of the chunk file, joined onto ``base_dir``.
        base_dir: Directory that holds the chunk files.

    A missing file is logged as a warning and otherwise ignored. A path that
    resolves outside ``base_dir`` is logged as a warning and left untouched.
    Any other ``OSError`` raised by ``os.remove`` propagates to the caller.
    """
    full_path = os.path.join(base_dir, file_path)

    # os.path.join drops base_dir entirely when file_path is absolute, and
    # ".." segments or symlinked directories can lead out of it. Resolve both
    # sides and refuse to delete anything that is not below base_dir.
    root = os.path.realpath(base_dir)
    resolved = os.path.realpath(full_path)
    # rstrip keeps the prefix test correct when base_dir is a filesystem root.
    if not resolved.startswith(root.rstrip(os.sep) + os.sep):
        logger.warning(
            "Chunk file outside chunk directory, skipping: %s", full_path
        )
        return

    try:
        os.remove(full_path)
        logger.info("Removed chunk file: %s", full_path)
    except FileNotFoundError:
        logger.warning("Chunk file not found, skipping: %s", full_path)
|
|
|
|
|
|
@router.delete("/documents/{document_id}", response_model=DeleteResponse)
async def delete_document(document_id: str):
    """Delete a document, all of its chunks and their chunk files.

    Responds with 404 when the document is not found.
    """
    from app.core.config import get_settings

    settings = get_settings()
    rag = RAGService(settings=settings)

    # Collect chunk file paths before ChromaDB deletion (metadata lost after delete)
    chunks = rag.list_chunks(document_id)
    chunk_paths = [
        c["chunk_file_path"] for c in chunks if c.get("chunk_file_path")
    ]

    success, deleted_count = rag.delete_document(document_id)

    if not success:
        raise HTTPException(status_code=404, detail=f"Document not found: {document_id}")

    # Files are removed only after the store has confirmed the deletion, so a
    # failed or rejected delete never leaves chunk records pointing at files
    # that are already gone.
    for rel_path in chunk_paths:
        try:
            _remove_chunk_file(rel_path, settings.document_chunk_path)
        except OSError:
            # The document is already deleted at this point; a leftover file
            # is logged instead of turning the request into an error.
            logger.exception("Failed to remove chunk file: %s", rel_path)

    logger.info("Deleted document %s: %d chunks removed", document_id, deleted_count)

    return DeleteResponse(
        deleted=True,
        message=f"Deleted document {document_id}: {deleted_count} chunks removed",
    )
|
|
|
|
|
|
@router.delete("/chunks/{chunk_id}", response_model=DeleteResponse)
async def delete_chunk(chunk_id: str):
    """Delete a single chunk and, if it has one, its chunk file.

    Responds with 404 when the chunk is not found.
    """
    from app.core.config import get_settings

    settings = get_settings()
    rag = RAGService(settings=settings)

    # Get chunk metadata before ChromaDB deletion (metadata lost after delete)
    result = rag.collection.get(ids=[chunk_id], include=["metadatas"])
    # The lookup may yield no list, an empty list, or a None entry for the
    # chunk; all of these are treated as "no chunk file to remove".
    metadatas = result.get("metadatas") or []
    metadata = metadatas[0] if metadatas else None
    chunk_file_path = metadata.get("chunk_file_path") if metadata else None

    success = rag.delete_chunk(chunk_id)

    if not success:
        raise HTTPException(status_code=404, detail=f"Chunk not found: {chunk_id}")

    # The file is removed only after the store has confirmed the deletion, so
    # a failed or rejected delete never leaves a chunk record whose file is
    # already gone.
    if chunk_file_path:
        try:
            _remove_chunk_file(chunk_file_path, settings.document_chunk_path)
        except OSError:
            # The chunk is already deleted at this point; a leftover file is
            # logged instead of turning the request into an error.
            logger.exception("Failed to remove chunk file: %s", chunk_file_path)

    logger.info("Deleted chunk: %s", chunk_id)

    return DeleteResponse(
        deleted=True,
        message=f"Deleted chunk {chunk_id}",
    )
|
|
|
|
|
|
@router.get("/chunks/{file_path:path}/pdf")
async def get_chunk_pdf(file_path: str):
    """Serve a chunk PDF stored under the configured chunk directory.

    Responds with 400 when the path contains ``..`` or resolves outside the
    chunk directory, and with 404 when no file exists at the resolved path.
    """
    from app.core.config import get_settings

    # Cheap textual rejection before the filesystem is consulted.
    if ".." in file_path:
        raise HTTPException(status_code=400, detail="Invalid file path")

    chunk_root = os.path.realpath(get_settings().document_chunk_path)
    resolved = os.path.realpath(os.path.join(chunk_root, file_path))

    # After resolution the target must be the chunk directory itself or lie
    # below it.
    inside_root = resolved == chunk_root or resolved.startswith(chunk_root + os.sep)
    if not inside_root:
        raise HTTPException(status_code=400, detail="Invalid file path")

    if not os.path.isfile(resolved):
        raise HTTPException(status_code=404, detail=f"Chunk file not found: {file_path}")

    return FileResponse(
        resolved,
        media_type="application/pdf",
        filename=os.path.basename(resolved),
    )
|