fix(backend): preserve original filename in chunk metadata instead of temp file name

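When a file is uploaded, the backend writes it to a NamedTemporaryFile, so os.path.basename(file_path) returns a temp name like 'tmp90i7xqa8.pdf' instead of the name the user uploaded. Add an optional original_filename parameter to extract_metadata() and pass the upload's filename from ingest_document(), so the actual upload filename is stored in each chunk's metadata. When original_filename is omitted or empty, the basename of file_path is still used.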

Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent)

Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
This commit is contained in:
Woody 2026-04-24 10:14:58 +08:00
parent c10318b7f7
commit b48c23001e
2 changed files with 6 additions and 4 deletions

View File

@@ -59,7 +59,7 @@ async def ingest_document(file: UploadFile = File(...)):
if not chunks:
raise HTTPException(status_code=400, detail="Document appears to be empty or could not be parsed")
metadata = extract_metadata(temp_path, chunks)
metadata = extract_metadata(temp_path, chunks, original_filename=filename)
rag = RAGService(settings=settings)
document_id = rag.ingest_document(temp_path, chunks, metadata)

View File

@@ -5,11 +5,11 @@ from datetime import datetime
from typing import List, Dict, Any
def extract_metadata(file_path: str, chunks: List[str]) -> List[Dict[str, Any]]:
def extract_metadata(file_path: str, chunks: List[str], original_filename: str | None = None) -> List[Dict[str, Any]]:
"""Extract metadata for a list of text chunks.
For each chunk, create a metadata dictionary containing:
- filename: basename of the provided file_path
- filename: basename of the provided file_path (or original_filename if provided)
- upload_date: ISO 8601 timestamp of when metadata was generated
- content_summary: first 200 characters of the chunk (or full chunk if shorter)
- chunk_index: 0-based index of the chunk
@@ -17,6 +17,8 @@ def extract_metadata(file_path: str, chunks: List[str]) -> List[Dict[str, Any]]:
Args:
file_path: Path to the file associated with the chunks.
chunks: List of string chunks to generate metadata for.
original_filename: Override filename stored in metadata when file_path
is a temp file.
Returns:
A list of metadata dictionaries, one per chunk. If chunks is empty, returns an empty list.
@@ -33,7 +35,7 @@ def extract_metadata(file_path: str, chunks: List[str]) -> List[Dict[str, Any]]:
if not os.path.exists(file_path):
raise FileNotFoundError(f"File not found: {file_path}")
filename = os.path.basename(file_path)
filename = original_filename if original_filename else os.path.basename(file_path)
upload_date = datetime.now().isoformat()
metadata: List[Dict[str, Any]] = []