"""Document ingestion router.""" import os import tempfile import uuid from pathlib import Path from fastapi import APIRouter, UploadFile, File, HTTPException from app.models.ingest import IngestResponse router = APIRouter(tags=["ingest"]) SUPPORTED_EXTENSIONS = {".pdf", ".docx"} @router.post("/ingest", response_model=IngestResponse) async def ingest_document(file: UploadFile = File(...)): """Ingest a document into the RAG system. Accepts PDF and DOCX files, parses text, chunks, extracts metadata, embeds, and stores in ChromaDB. """ from app.services.rag import RAGService from app.utils.chunking import TokenChunkingStrategy from app.utils.metadata import extract_metadata file_ext = Path(file.filename or "").suffix.lower() if file_ext not in SUPPORTED_EXTENSIONS: raise HTTPException( status_code=400, detail=f"Unsupported file format: {file_ext}. Supported: {', '.join(SUPPORTED_EXTENSIONS)}", ) temp_path = None try: with tempfile.NamedTemporaryFile(delete=False, suffix=file_ext) as tmp: content = await file.read() tmp.write(content) temp_path = tmp.name if file_ext == ".pdf": from app.utils.pdf_parser import parse_pdf text = parse_pdf(temp_path) elif file_ext == ".docx": from app.utils.docx_parser import parse_docx text = parse_docx(temp_path) else: text = "" chunker = TokenChunkingStrategy(chunk_size=1000, overlap=200) chunks = chunker.chunk(text) metadata = extract_metadata(temp_path, chunks) rag = RAGService() document_id = rag.ingest_document(temp_path, chunks, metadata) return IngestResponse( document_id=document_id, chunk_count=len(chunks), filename=file.filename or "unknown", ) except Exception as e: raise HTTPException(status_code=500, detail=f"Ingestion failed: {str(e)}") finally: if temp_path and os.path.exists(temp_path): os.unlink(temp_path)