71 lines
2.1 KiB
Python
71 lines
2.1 KiB
Python
"""Document ingestion router."""
|
|
import os
|
|
import tempfile
|
|
import uuid
|
|
from pathlib import Path
|
|
|
|
from fastapi import APIRouter, UploadFile, File, HTTPException
|
|
|
|
from app.models.ingest import IngestResponse
|
|
|
|
# Router that the ingestion endpoint below is registered on; routes carry the "ingest" tag.
router = APIRouter(tags=["ingest"])

# Lower-cased filename suffixes (leading dot included) that uploads may have.
# Frozen so that code importing this constant cannot accidentally widen or
# narrow the validation applied to every request.
SUPPORTED_EXTENSIONS = frozenset({".pdf", ".docx"})
|
|
|
|
|
|
@router.post("/ingest", response_model=IngestResponse)
async def ingest_document(file: UploadFile = File(...)):
    """Ingest a document into the RAG system.

    Accepts PDF and DOCX files, parses text, chunks, extracts metadata,
    embeds, and stores in ChromaDB. The upload is written to a temporary
    file for the parsers; that file is removed before the request finishes,
    whether ingestion succeeded or not.

    Args:
        file: The uploaded document. Its filename suffix (case-insensitive)
            must be one of ``SUPPORTED_EXTENSIONS``.

    Returns:
        An ``IngestResponse`` with the id returned by
        ``RAGService.ingest_document``, the number of chunks produced, and
        the uploaded filename (``"unknown"`` when the client sent none).

    Raises:
        HTTPException: 400 when the file extension is not supported; 500
            when any pipeline step fails. An ``HTTPException`` raised inside
            the pipeline is propagated unchanged.
    """
    # Imported lazily, as in the original module layout, so that importing
    # this router does not pull in the service and utility modules.
    from app.services.rag import RAGService
    from app.utils.chunking import TokenChunkingStrategy
    from app.utils.metadata import extract_metadata

    file_ext = Path(file.filename or "").suffix.lower()
    if file_ext not in SUPPORTED_EXTENSIONS:
        # Sorted so the message is identical across processes (set iteration
        # order depends on string hashing).
        supported = ", ".join(sorted(SUPPORTED_EXTENSIONS))
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported file format: {file_ext}. Supported: {supported}",
        )

    temp_path = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=file_ext) as tmp:
            # Record the path first: if reading the upload or writing it out
            # fails, the finally block can still remove the file.
            temp_path = tmp.name
            content = await file.read()
            tmp.write(content)

        # The temp file is closed (and flushed) here, so parsers can reopen it.
        if file_ext == ".pdf":
            from app.utils.pdf_parser import parse_pdf

            text = parse_pdf(temp_path)
        elif file_ext == ".docx":
            from app.utils.docx_parser import parse_docx

            text = parse_docx(temp_path)
        else:
            # Not reachable while every supported extension has a branch above.
            text = ""

        chunker = TokenChunkingStrategy(chunk_size=1000, overlap=200)
        chunks = chunker.chunk(text)

        metadata = extract_metadata(temp_path, chunks)

        rag = RAGService()
        document_id = rag.ingest_document(temp_path, chunks, metadata)

        return IngestResponse(
            document_id=document_id,
            chunk_count=len(chunks),
            filename=file.filename or "unknown",
        )
    except HTTPException:
        # Keep the status code and detail of deliberate HTTP errors instead
        # of rewriting them as a generic 500.
        raise
    except Exception as e:
        # Chain the cause so server-side tracebacks show the real failure.
        raise HTTPException(status_code=500, detail=f"Ingestion failed: {e}") from e
    finally:
        if temp_path and os.path.exists(temp_path):
            os.unlink(temp_path)
|