82 lines
2.6 KiB
Python
82 lines
2.6 KiB
Python
"""Document ingestion router."""
|
|
import logging
|
|
import os
|
|
import tempfile
|
|
from pathlib import Path
|
|
|
|
from fastapi import APIRouter, UploadFile, File, HTTPException
|
|
|
|
from app.models.ingest import IngestResponse
|
|
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Router exposing the ingestion endpoint(s); routes are grouped under the "ingest" tag.
router = APIRouter(tags=["ingest"])

# File extensions (lower-case, leading dot included) accepted for ingestion.
SUPPORTED_EXTENSIONS = {
    ".pdf",
    ".docx",
    ".txt",
}
|
|
|
|
|
|
@router.post("/ingest", response_model=IngestResponse)
async def ingest_document(file: UploadFile = File(...)):
    """Ingest an uploaded document into the RAG system.

    The upload is written to a temporary file, converted to text by the
    parser matching its extension, split into chunks, and passed to
    ``RAGService.ingest_document`` together with the extracted metadata.
    The temporary file is removed before the handler finishes, whether or
    not ingestion succeeded.

    Args:
        file: The uploaded document. Its filename extension (compared
            case-insensitively) must be one of ``SUPPORTED_EXTENSIONS``.

    Returns:
        An ``IngestResponse`` holding the document id returned by the RAG
        service, the number of chunks produced, and the uploaded filename.

    Raises:
        HTTPException: 400 if the extension is unsupported, a ``.txt``
            upload is not valid UTF-8, or chunking produces nothing;
            500 for any other failure during ingestion.
    """
    # Imported inside the handler, as in the original implementation.
    from app.services.rag import RAGService
    from app.utils.chunking import TokenChunkingStrategy
    from app.utils.metadata import extract_metadata

    filename = file.filename or "unknown"
    file_ext = Path(filename).suffix.lower()

    if file_ext not in SUPPORTED_EXTENSIONS:
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported file format: {file_ext}. Supported: {', '.join(sorted(SUPPORTED_EXTENSIONS))}",
        )

    temp_path = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=file_ext) as tmp:
            # Record the path before any I/O: the file is created with
            # delete=False, so if read()/write() raises we must still be
            # able to remove it in the ``finally`` clause below.
            temp_path = tmp.name
            content = await file.read()
            tmp.write(content)

        logger.info("Ingesting file: %s (%d bytes)", filename, len(content))

        if file_ext == ".pdf":
            from app.utils.pdf_parser import parse_pdf

            text = parse_pdf(temp_path)
        elif file_ext == ".docx":
            from app.utils.docx_parser import parse_docx

            text = parse_docx(temp_path)
        elif file_ext == ".txt":
            try:
                with open(temp_path, "r", encoding="utf-8") as f:
                    text = f.read()
            except UnicodeDecodeError as exc:
                # Undecodable bytes are a problem with the upload, not the
                # server, so report a client error instead of a 500.
                raise HTTPException(
                    status_code=400,
                    detail="Text file is not valid UTF-8",
                ) from exc
        else:
            # Defensive fallback: unreachable while every supported extension
            # has a branch above; empty text leads to the 400 below.
            text = ""

        chunker = TokenChunkingStrategy(chunk_size=1000, overlap=200)
        chunks = chunker.chunk(text)

        if not chunks:
            raise HTTPException(status_code=400, detail="Document appears to be empty or could not be parsed")

        # NOTE(review): the temporary path (not the uploaded filename) is
        # what these helpers receive — confirm they do not persist it.
        metadata = extract_metadata(temp_path, chunks)

        rag = RAGService()
        document_id = rag.ingest_document(temp_path, chunks, metadata)

        logger.info("Ingested %s: %d chunks, doc_id=%s", filename, len(chunks), document_id)

        return IngestResponse(
            document_id=document_id,
            chunk_count=len(chunks),
            filename=filename,
        )

    except HTTPException:
        # Deliberate HTTP errors pass through unchanged.
        raise
    except Exception as e:
        # logger.exception keeps the same message and adds the traceback.
        logger.exception("Ingestion failed for %s: %s", filename, e)
        raise HTTPException(status_code=500, detail=f"Ingestion failed: {e}") from e

    finally:
        if temp_path is not None:
            try:
                os.unlink(temp_path)
            except FileNotFoundError:
                # Already gone; nothing to clean up.
                pass
            except OSError as cleanup_err:
                # A cleanup failure must not mask the handler's real outcome.
                logger.warning("Could not remove temporary file %s: %s", temp_path, cleanup_err)
|