diff --git a/backend/app/routers/ingest.py b/backend/app/routers/ingest.py index 63dee14..deb4495 100644 --- a/backend/app/routers/ingest.py +++ b/backend/app/routers/ingest.py @@ -59,7 +59,7 @@ async def ingest_document(file: UploadFile = File(...)): if not chunks: raise HTTPException(status_code=400, detail="Document appears to be empty or could not be parsed") - metadata = extract_metadata(temp_path, chunks) + metadata = extract_metadata(temp_path, chunks, original_filename=filename) rag = RAGService(settings=settings) document_id = rag.ingest_document(temp_path, chunks, metadata) diff --git a/backend/app/utils/metadata.py b/backend/app/utils/metadata.py index 1bef64c..4eaabb0 100644 --- a/backend/app/utils/metadata.py +++ b/backend/app/utils/metadata.py @@ -5,11 +5,11 @@ from datetime import datetime from typing import List, Dict, Any -def extract_metadata(file_path: str, chunks: List[str]) -> List[Dict[str, Any]]: +def extract_metadata(file_path: str, chunks: List[str], original_filename: str | None = None) -> List[Dict[str, Any]]: """Extract metadata for a list of text chunks. For each chunk, create a metadata dictionary containing: - - filename: basename of the provided file_path + - filename: basename of the provided file_path (or original_filename if provided) - upload_date: ISO 8601 timestamp of when metadata was generated - content_summary: first 200 characters of the chunk (or full chunk if shorter) - chunk_index: 0-based index of the chunk @@ -17,6 +17,8 @@ def extract_metadata(file_path: str, chunks: List[str]) -> List[Dict[str, Any]]: Args: file_path: Path to the file associated with the chunks. chunks: List of string chunks to generate metadata for. + original_filename: Override filename stored in metadata when file_path + is a temp file. Returns: A list of metadata dictionaries, one per chunk. If chunks is empty, returns an empty list. @@ -33,7 +35,7 @@ def extract_metadata(file_path: str, chunks: List[str]) -> List[Dict[str, Any]]: if not os.path.exists(file_path): raise FileNotFoundError(f"File not found: {file_path}") - filename = os.path.basename(file_path) + filename = original_filename if original_filename else os.path.basename(file_path) upload_date = datetime.now().isoformat() metadata: List[Dict[str, Any]] = []