"""Document ingestion router.""" import logging import os import tempfile from pathlib import Path from fastapi import APIRouter, UploadFile, File, HTTPException from app.models.ingest import IngestResponse logger = logging.getLogger(__name__) router = APIRouter(tags=["ingest"]) SUPPORTED_EXTENSIONS = {".pdf", ".docx", ".txt"} @router.post("/ingest", response_model=IngestResponse) async def ingest_document(file: UploadFile = File(...)): """Ingest a document into the RAG system.""" from app.core.config import get_settings from app.services.rag import RAGService from app.utils.chunking import TokenChunkingStrategy from app.utils.metadata import extract_metadata filename = file.filename or "unknown" file_ext = Path(filename).suffix.lower() if file_ext not in SUPPORTED_EXTENSIONS: raise HTTPException( status_code=400, detail=f"Unsupported file format: {file_ext}. Supported: {', '.join(sorted(SUPPORTED_EXTENSIONS))}", ) settings = get_settings() temp_path = None try: with tempfile.NamedTemporaryFile(delete=False, suffix=file_ext) as tmp: content = await file.read() tmp.write(content) temp_path = tmp.name logger.info("Ingesting file: %s (%d bytes)", filename, len(content)) if file_ext == ".pdf": from app.utils.pdf_parser import parse_pdf text = parse_pdf(temp_path) elif file_ext == ".docx": from app.utils.docx_parser import parse_docx text = parse_docx(temp_path) elif file_ext == ".txt": with open(temp_path, "r", encoding="utf-8") as f: text = f.read() else: text = "" chunker = TokenChunkingStrategy(chunk_size=settings.chunk_size, overlap=settings.chunk_overlap) chunks = chunker.chunk(text) if not chunks: raise HTTPException(status_code=400, detail="Document appears to be empty or could not be parsed") metadata = extract_metadata(temp_path, chunks) rag = RAGService(settings=settings) document_id = rag.ingest_document(temp_path, chunks, metadata) logger.info("Ingested %s: %d chunks, doc_id=%s", filename, len(chunks), document_id) return IngestResponse( document_id=document_id, chunk_count=len(chunks), filename=filename, ) except HTTPException: raise except Exception as e: logger.error("Ingestion failed for %s: %s", filename, str(e)) raise HTTPException(status_code=500, detail=f"Ingestion failed: {str(e)}") finally: if temp_path and os.path.exists(temp_path): os.unlink(temp_path)