# legco_ai_assistant/backend/app/routers/ingest.py

"""Document ingestion router."""
import logging
import os
import tempfile
from pathlib import Path

from fastapi import APIRouter, UploadFile, File, HTTPException

from app.models.ingest import IngestResponse

# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
# Router holding the ingestion endpoints; its routes carry the "ingest" tag.
router = APIRouter(tags=["ingest"])

# Upload extensions (lowercase, leading dot) that the /ingest endpoint accepts.
SUPPORTED_EXTENSIONS = {".pdf", ".docx", ".txt"}


def _read_document_text(path: str, file_ext: str) -> str:
    """Extract the text of the file at *path* according to *file_ext*.

    The PDF and DOCX parsers are imported inside their branches, so only the
    parser matching the extension is imported.

    Args:
        path: Filesystem path of the (temporary) uploaded file.
        file_ext: Lowercased extension including the dot, e.g. ``".pdf"``.

    Returns:
        The extracted text, or ``""`` for an extension with no parser.

    Raises:
        HTTPException: 400 when a ``.txt`` file is not valid UTF-8.
    """
    if file_ext == ".pdf":
        from app.utils.pdf_parser import parse_pdf

        return parse_pdf(path)

    if file_ext == ".docx":
        from app.utils.docx_parser import parse_docx

        return parse_docx(path)

    if file_ext == ".txt":
        try:
            # "utf-8-sig" reads plain UTF-8 unchanged but drops a leading BOM,
            # which would otherwise end up inside the first chunk.
            with open(path, "r", encoding="utf-8-sig") as f:
                return f.read()
        except UnicodeDecodeError as exc:
            # A wrongly encoded upload is a client error, not a server fault.
            raise HTTPException(
                status_code=400,
                detail="Text file is not valid UTF-8",
            ) from exc

    return ""


@router.post("/ingest", response_model=IngestResponse)
async def ingest_document(file: UploadFile = File(...)) -> IngestResponse:
    """Ingest an uploaded document into the RAG system.

    The upload is written to a temporary file, converted to text, split into
    chunks, and handed to ``RAGService.ingest_document`` together with the
    metadata from ``extract_metadata``. The temporary file is removed before
    the handler returns, whether it succeeds or fails.

    Args:
        file: The uploaded document; its extension must be one of
            ``SUPPORTED_EXTENSIONS``.

    Returns:
        An ``IngestResponse`` with the new document id, the number of chunks,
        and the original filename.

    Raises:
        HTTPException: 400 for an unsupported extension, an empty upload, a
            document that yields no chunks, or a ``.txt`` file that is not
            valid UTF-8; 500 for any other failure during ingestion.
    """
    from app.core.config import get_settings
    from app.services.rag import RAGService
    from app.utils.chunking import TokenChunkingStrategy
    from app.utils.metadata import extract_metadata

    filename = file.filename or "unknown"
    file_ext = Path(filename).suffix.lower()

    if file_ext not in SUPPORTED_EXTENSIONS:
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported file format: {file_ext}. Supported: {', '.join(sorted(SUPPORTED_EXTENSIONS))}",
        )

    settings = get_settings()
    temp_path = None
    try:
        # Read the upload before creating the temp file, so a failed read
        # leaves nothing on disk.
        content = await file.read()
        if not content:
            # Reject zero-byte uploads up front instead of passing them to a
            # parser.
            raise HTTPException(
                status_code=400,
                detail="Document appears to be empty or could not be parsed",
            )

        with tempfile.NamedTemporaryFile(delete=False, suffix=file_ext) as tmp:
            # Record the path first: with delete=False the file must be
            # removed by the cleanup below even if the write fails.
            temp_path = tmp.name
            tmp.write(content)

        logger.info("Ingesting file: %s (%d bytes)", filename, len(content))

        # NOTE(review): the parsing, chunking and RAG calls below are plain
        # synchronous calls inside an async handler; if they are slow they
        # will hold up the event loop -- confirm whether offloading is needed.
        text = _read_document_text(temp_path, file_ext)

        chunker = TokenChunkingStrategy(
            chunk_size=settings.chunk_size,
            overlap=settings.chunk_overlap,
        )
        chunks = chunker.chunk(text)

        if not chunks:
            raise HTTPException(
                status_code=400,
                detail="Document appears to be empty or could not be parsed",
            )

        metadata = extract_metadata(temp_path, chunks, original_filename=filename)

        rag = RAGService(settings=settings)
        document_id = rag.ingest_document(temp_path, chunks, metadata)

        logger.info("Ingested %s: %d chunks, doc_id=%s", filename, len(chunks), document_id)

        return IngestResponse(
            document_id=document_id,
            chunk_count=len(chunks),
            filename=filename,
        )

    except HTTPException:
        # Deliberate HTTP errors pass through unchanged.
        raise
    except Exception as e:
        # logger.exception records the traceback along with the message.
        logger.exception("Ingestion failed for %s: %s", filename, str(e))
        raise HTTPException(status_code=500, detail=f"Ingestion failed: {str(e)}") from e

    finally:
        if temp_path is not None:
            try:
                os.unlink(temp_path)
            except FileNotFoundError:
                # Already gone; nothing to clean up.
                pass
            except OSError as cleanup_err:
                # Best-effort cleanup: a failed delete must not replace the
                # response or the original error.
                logger.warning("Could not remove temp file %s: %s", temp_path, cleanup_err)