# legco_ai_assistant/backend/app/utils/metadata.py
#
# 56 lines
# 1.9 KiB
# Python

from __future__ import annotations
import os
from datetime import datetime
from typing import List, Dict, Any
def extract_metadata(file_path: str, chunks: List[str], original_filename: str | None = None) -> List[Dict[str, Any]]:
    """Build one metadata dictionary per text chunk.

    Each dictionary contains:
        - filename: ``original_filename`` when it is truthy, otherwise the
          basename of ``file_path``.
        - upload_date: ISO 8601 timestamp taken once per call, so every
          chunk produced by the same call shares the same value.
        - content_summary: first 200 characters of the chunk (the whole
          chunk if it is shorter); an empty string for a non-string chunk.
        - chunk_index: 0-based position of the chunk within ``chunks``.

    Args:
        file_path: Path to the file the chunks were taken from.
        chunks: Text chunks to describe.
        original_filename: Name to store instead of the basename of
            ``file_path``, e.g. when ``file_path`` is a temp file.

    Returns:
        A list of metadata dictionaries in the same order as ``chunks``.
        An empty list when ``chunks`` is empty.

    Raises:
        FileNotFoundError: If ``chunks`` is non-empty and ``file_path`` does
            not exist.
    """
    # The empty-input check runs before the path check, so a call with
    # nothing to describe returns [] even for a missing path.
    if not chunks:
        return []

    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    # An empty-string override falls back to the basename, same as None.
    filename = original_filename or os.path.basename(file_path)

    # NOTE(review): datetime.now() without a tz argument yields a naive
    # local-time stamp (no UTC offset in the string). Consider a
    # timezone-aware value if downstream consumers can accept the suffix.
    upload_date = datetime.now().isoformat()

    return [
        {
            "filename": filename,
            "upload_date": upload_date,
            # Non-string chunks are summarized as "" rather than raising.
            "content_summary": chunk[:200] if isinstance(chunk, str) else "",
            "chunk_index": idx,
        }
        for idx, chunk in enumerate(chunks)
    ]