# legco_ai_assistant/backend/app/routers/ingest.py

"""Document ingestion router."""
import logging
import os
import tempfile
import uuid
from pathlib import Path

from fastapi import APIRouter, UploadFile, File, HTTPException

from app.models.ingest import IngestResponse

# Router that the ingestion endpoint below is registered on; its routes are
# tagged "ingest".
router = APIRouter(tags=["ingest"])

# File suffixes (lower-case, including the leading dot) accepted by /ingest.
# The handler lower-cases the uploaded filename's suffix before checking it
# against this set.
SUPPORTED_EXTENSIONS = {".pdf", ".docx"}


@router.post("/ingest", response_model=IngestResponse)
async def ingest_document(file: UploadFile = File(...)) -> IngestResponse:
    """Ingest an uploaded document into the RAG system.

    The upload is copied to a temporary file, parsed to text (PDF or DOCX),
    split into token chunks, annotated with metadata and handed to
    ``RAGService.ingest_document``, which is expected to embed the chunks
    and store them in ChromaDB. The temporary file is removed before the
    handler returns, whether or not ingestion succeeded.

    Args:
        file: The uploaded document. The suffix of its filename selects the
            parser and must be one of ``SUPPORTED_EXTENSIONS``.

    Returns:
        An ``IngestResponse`` carrying the document id returned by the RAG
        service, the number of chunks produced, and the client's filename
        (``"unknown"`` when none was supplied).

    Raises:
        HTTPException: 400 when the file suffix is not supported; 500 when
            any ingestion step fails. An ``HTTPException`` raised by a
            downstream step is propagated with its own status code.
    """
    # Deferred to call time, as in the original handler (presumably to keep
    # heavy dependencies out of module import -- confirm).
    from app.services.rag import RAGService
    from app.utils.chunking import TokenChunkingStrategy
    from app.utils.metadata import extract_metadata

    file_ext = Path(file.filename or "").suffix.lower()

    if file_ext not in SUPPORTED_EXTENSIONS:
        # sorted() keeps the message stable; set iteration order is not.
        supported = ", ".join(sorted(SUPPORTED_EXTENSIONS))
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported file format: {file_ext}. Supported: {supported}",
        )

    copy_chunk_bytes = 1024 * 1024
    temp_path = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=file_ext) as tmp:
            # Record the path before any I/O so the file is still cleaned up
            # if reading the upload or writing to disk fails part-way.
            temp_path = tmp.name
            # Copy in bounded pieces instead of holding the whole upload in
            # memory at once.
            while True:
                piece = await file.read(copy_chunk_bytes)
                if not piece:
                    break
                tmp.write(piece)

        # NOTE(review): parsing, chunking and ingestion below are plain
        # synchronous calls and occupy the event loop while they run.
        if file_ext == ".pdf":
            from app.utils.pdf_parser import parse_pdf
            text = parse_pdf(temp_path)
        elif file_ext == ".docx":
            from app.utils.docx_parser import parse_docx
            text = parse_docx(temp_path)
        else:
            # Only reachable if SUPPORTED_EXTENSIONS gains an entry without a
            # parser; fail loudly rather than ingest an empty document.
            raise ValueError(f"No parser available for {file_ext}")

        chunker = TokenChunkingStrategy(chunk_size=1000, overlap=200)
        chunks = chunker.chunk(text)

        # NOTE(review): both calls receive the temporary path, not the
        # client's filename -- confirm downstream does not record the
        # temporary name as the document's source.
        metadata = extract_metadata(temp_path, chunks)

        rag = RAGService()
        document_id = rag.ingest_document(temp_path, chunks, metadata)

        return IngestResponse(
            document_id=document_id,
            chunk_count=len(chunks),
            filename=file.filename or "unknown",
        )

    except HTTPException:
        # Keep the status code chosen by whichever step raised it.
        raise

    except Exception as e:
        # Boundary handler: record the traceback server-side, then surface a
        # 500 chained to the original error.
        logging.getLogger(__name__).exception(
            "Ingestion failed for %r", file.filename
        )
        raise HTTPException(
            status_code=500, detail=f"Ingestion failed: {e}"
        ) from e

    finally:
        if temp_path is not None:
            try:
                os.unlink(temp_path)
            except FileNotFoundError:
                # Already gone; nothing left to clean up.
                pass