legco_ai_assistant/backend/app/routers/ingest.py

84 lines
2.7 KiB
Python

"""Document ingestion router."""
import logging
import os
import tempfile
from pathlib import Path
from fastapi import APIRouter, UploadFile, File, HTTPException
from app.models.ingest import IngestResponse
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
# Router that the ingestion endpoint below is registered on; its routes are tagged "ingest".
router = APIRouter(tags=["ingest"])
# File suffixes accepted for upload. Compared against the lower-cased suffix of
# the uploaded filename; anything else is rejected with HTTP 400.
SUPPORTED_EXTENSIONS = {".pdf", ".docx", ".txt"}
def _extract_text(path: str, file_ext: str) -> str:
    """Return the text content of the file at *path*, dispatching on *file_ext*.

    The PDF and DOCX parsers are imported inside their branch, so a parser
    module is only imported when a file of that type is processed.

    Args:
        path: Filesystem path of the (temporary) file to read.
        file_ext: Lower-cased file suffix including the dot, e.g. ``".pdf"``.

    Returns:
        The extracted text, or ``""`` for a suffix this function does not handle.

    Raises:
        HTTPException: 400 if a ``.txt`` file is not valid UTF-8.
    """
    if file_ext == ".pdf":
        from app.utils.pdf_parser import parse_pdf

        return parse_pdf(path)
    if file_ext == ".docx":
        from app.utils.docx_parser import parse_docx

        return parse_docx(path)
    if file_ext == ".txt":
        try:
            with open(path, "r", encoding="utf-8") as f:
                return f.read()
        except UnicodeDecodeError as exc:
            # A badly encoded upload is a client error, not a server fault.
            raise HTTPException(
                status_code=400,
                detail="Text file is not valid UTF-8",
            ) from exc
    # The endpoint validates the suffix before calling; keep the original
    # empty-text fallback so an unexpected suffix yields the "empty document" 400.
    return ""


@router.post("/ingest", response_model=IngestResponse)
async def ingest_document(file: UploadFile = File(...)) -> IngestResponse:
    """Ingest an uploaded document into the RAG system.

    The upload is written to a temporary file, converted to text, split into
    chunks, and handed to ``RAGService.ingest_document`` together with the
    metadata produced by ``extract_metadata``. The temporary file is removed
    before the function returns, whether or not ingestion succeeded.

    Args:
        file: The uploaded document. Its filename suffix must be one of
            ``SUPPORTED_EXTENSIONS``.

    Returns:
        An ``IngestResponse`` carrying the document id returned by the RAG
        service, the number of chunks, and the original filename.

    Raises:
        HTTPException: 400 for an unsupported suffix, a ``.txt`` file that is
            not valid UTF-8, or a document that yields no chunks; 500 for any
            other failure during ingestion.
    """
    # Imported lazily inside the handler, as in the original implementation.
    from app.core.config import get_settings
    from app.services.rag import RAGService
    from app.utils.chunking import TokenChunkingStrategy
    from app.utils.metadata import extract_metadata

    filename = file.filename or "unknown"
    file_ext = Path(filename).suffix.lower()
    if file_ext not in SUPPORTED_EXTENSIONS:
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported file format: {file_ext}. Supported: {', '.join(sorted(SUPPORTED_EXTENSIONS))}",
        )

    settings = get_settings()
    temp_path = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=file_ext) as tmp:
            # Record the path before reading/writing: the file is created with
            # delete=False, so if the read or write fails the ``finally`` block
            # must still know what to remove.
            temp_path = tmp.name
            content = await file.read()
            tmp.write(content)
        # The ``with`` block has closed (and therefore flushed) the file, so
        # the parsers below see the complete upload.
        logger.info("Ingesting file: %s (%d bytes)", filename, len(content))

        text = _extract_text(temp_path, file_ext)

        chunker = TokenChunkingStrategy(chunk_size=settings.chunk_size, overlap=settings.chunk_overlap)
        chunks = chunker.chunk(text)
        if not chunks:
            raise HTTPException(status_code=400, detail="Document appears to be empty or could not be parsed")

        metadata = extract_metadata(temp_path, chunks, original_filename=filename)
        rag = RAGService(settings=settings)
        document_id = rag.ingest_document(temp_path, chunks, metadata)
        logger.info("Ingested %s: %d chunks, doc_id=%s", filename, len(chunks), document_id)
        return IngestResponse(
            document_id=document_id,
            chunk_count=len(chunks),
            filename=filename,
        )
    except HTTPException:
        # Deliberate client-facing errors pass through unchanged.
        raise
    except Exception as e:
        # logger.exception records the traceback along with the message.
        logger.exception("Ingestion failed for %s: %s", filename, str(e))
        # NOTE(review): the raw exception text is returned to the client; kept
        # for backward compatibility, but consider a generic message.
        raise HTTPException(status_code=500, detail=f"Ingestion failed: {str(e)}") from e
    finally:
        if temp_path is not None:
            try:
                os.unlink(temp_path)
            except FileNotFoundError:
                # Already gone; nothing left to clean up.
                pass