legco_ai_assistant/backend/app/routers/ingest.py

71 lines
2.1 KiB
Python

"""Document ingestion router."""
import os
import tempfile
import uuid
from pathlib import Path
from fastapi import APIRouter, UploadFile, File, HTTPException
from app.models.ingest import IngestResponse
# Router for the document-ingestion endpoint; routes are grouped under the
# "ingest" tag.
router = APIRouter(tags=["ingest"])
# Upload filename extensions accepted by the ingest endpoint. Entries are
# lower-case and include the leading dot, because the endpoint compares them
# against a lower-cased `Path(...).suffix`.
SUPPORTED_EXTENSIONS = {".pdf", ".docx"}
@router.post("/ingest", response_model=IngestResponse)
async def ingest_document(file: UploadFile = File(...)):
    """Ingest an uploaded document into the RAG system.

    Accepts PDF and DOCX files. The upload is spooled to a temporary file,
    parsed to text, split into chunks, passed through metadata extraction and
    handed to ``RAGService.ingest_document`` (which, per the original
    description of this endpoint, embeds the chunks and stores them in
    ChromaDB). The temporary file is always removed before returning.

    Args:
        file: The uploaded document. The extension of its filename is
            compared case-insensitively against ``SUPPORTED_EXTENSIONS``.

    Returns:
        IngestResponse: the document id returned by the RAG service, the
        number of chunks produced, and the uploaded filename (``"unknown"``
        when the upload carries none).

    Raises:
        HTTPException: 400 when the file extension is not supported; 500 when
            any parsing/chunking/storage step fails. An ``HTTPException``
            raised by a downstream step is propagated unchanged.
    """
    # Imported lazily, as in the original implementation, so the heavy service
    # modules are only loaded when the endpoint is actually called.
    from app.services.rag import RAGService
    from app.utils.chunking import TokenChunkingStrategy
    from app.utils.metadata import extract_metadata

    file_ext = Path(file.filename or "").suffix.lower()
    if file_ext not in SUPPORTED_EXTENSIONS:
        # Sort so the message is deterministic (set order varies per process).
        supported = ", ".join(sorted(SUPPORTED_EXTENSIONS))
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported file format: {file_ext}. Supported: {supported}",
        )

    temp_path = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=file_ext) as tmp:
            # Record the path *before* reading/writing: the file already
            # exists on disk, so a failure below must still trigger cleanup.
            temp_path = tmp.name
            tmp.write(await file.read())

        if file_ext == ".pdf":
            from app.utils.pdf_parser import parse_pdf

            text = parse_pdf(temp_path)
        elif file_ext == ".docx":
            from app.utils.docx_parser import parse_docx

            text = parse_docx(temp_path)
        else:
            # Defensive only: unreachable while every supported extension has
            # a parser branch above.
            text = ""

        chunker = TokenChunkingStrategy(chunk_size=1000, overlap=200)
        chunks = chunker.chunk(text)

        # NOTE(review): both calls receive the temporary path, not the
        # original upload name — confirm the helpers do not derive the
        # document title/source from the path.
        metadata = extract_metadata(temp_path, chunks)
        document_id = RAGService().ingest_document(temp_path, chunks, metadata)

        return IngestResponse(
            document_id=document_id,
            chunk_count=len(chunks),
            filename=file.filename or "unknown",
        )
    except HTTPException:
        # Keep the status/detail chosen by a downstream step instead of
        # flattening it into a generic 500.
        raise
    except Exception as e:
        raise HTTPException(
            status_code=500, detail=f"Ingestion failed: {e}"
        ) from e
    finally:
        if temp_path is not None:
            try:
                os.unlink(temp_path)
            except FileNotFoundError:
                # Already gone; nothing left to clean up.
                pass